1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
9 Written by Philip Hazel
10 Copyright (c) 1997-2012 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48 the performance of his patterns greatly. I could not use it as it stood, as it
49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 test 7 to loop, and test 9 to crash with a segfault.
52 The issue is the check for duplicate states, which is done by a simple linear
53 search up the state list. (Grep for "duplicate" below to find the code.) For
54 many patterns, there will never be many states active at one time, so a simple
55 linear search is fine. In patterns that have many active states, it might be a
56 bottleneck. The suggested code used an indexing scheme to remember which states
57 had previously been used for each character, and avoided the linear search when
58 it knew there was no chance of a duplicate. This was implemented when adding
59 states to the state lists.
61 I wrote some thread-safe, not-limited code to try something similar at the time
62 of checking for duplicates (instead of when adding states), using index vectors
63 on the stack. It did give a 13% improvement with one specially constructed
64 pattern for certain subject strings, but on other strings and on many of the
65 simpler patterns in the test suite it did worse. The major problem, I think,
66 was the extra time to initialize the index. This had to be done for each call
67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68 only once - I suspect this was the cause of the problems with the tests.)
70 Overall, I concluded that the gains in some cases did not outweigh the losses
71 in others, so I abandoned this code. */
79 #define NLBLOCK md /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
83 #include "pcre_internal.h"
86 /* For use to indent debugging output */
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96 into others, under special conditions. A gap of 20 between the blocks should be
97 enough. The resulting opcodes don't have to be less than 256 because they are
98 never stored, so we push them well clear of the normal opcodes. */
100 #define OP_PROP_EXTRA 300
101 #define OP_EXTUNI_EXTRA 320
102 #define OP_ANYNL_EXTRA 340
103 #define OP_HSPACE_EXTRA 360
104 #define OP_VSPACE_EXTRA 380
107 /* This table identifies those opcodes that are followed immediately by a
108 character that is to be tested in some way. This makes it possible to
109 centralize the loading of these characters. In the case of Type * etc, the
110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 small value. Non-zero values in the table are the offsets from the opcode where
112 the character is to be found. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
115 static const pcre_uint8 coptable
[] = {
117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
128 /* Positive single-char repeats */
129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 1+IMM2_SIZE
, 1+IMM2_SIZE
, /* upto, minupto */
131 1+IMM2_SIZE
, /* exact */
132 1, 1, 1, 1+IMM2_SIZE
, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 1+IMM2_SIZE
, 1+IMM2_SIZE
, /* upto I, minupto I */
135 1+IMM2_SIZE
, /* exact I */
136 1, 1, 1, 1+IMM2_SIZE
, /* *+I, ++I, ?+I, upto+I */
137 /* Negative single-char repeats - only for chars < 256 */
138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 1+IMM2_SIZE
, 1+IMM2_SIZE
, /* NOT upto, minupto */
140 1+IMM2_SIZE
, /* NOT exact */
141 1, 1, 1, 1+IMM2_SIZE
, /* NOT *+, ++, ?+, upto+ */
142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 1+IMM2_SIZE
, 1+IMM2_SIZE
, /* NOT upto I, minupto I */
144 1+IMM2_SIZE
, /* NOT exact I */
145 1, 1, 1, 1+IMM2_SIZE
, /* NOT *+I, ++I, ?+I, upto+I */
146 /* Positive type repeats */
147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 1+IMM2_SIZE
, 1+IMM2_SIZE
, /* Type upto, minupto */
149 1+IMM2_SIZE
, /* Type exact */
150 1, 1, 1, 1+IMM2_SIZE
, /* Type *+, ++, ?+, upto+ */
151 /* Character class & ref repeats */
152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153 0, 0, /* CRRANGE, CRMINRANGE */
156 0, /* XCLASS - variable length */
169 0, /* Assert behind */
170 0, /* Assert behind not */
171 0, 0, /* ONCE, ONCE_NC */
172 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
173 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
174 0, 0, /* CREF, NCREF */
175 0, 0, /* RREF, NRREF */
177 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
178 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
179 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
180 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
181 0, 0 /* CLOSE, SKIPZERO */
184 /* This table identifies those opcodes that inspect a character. It is used to
185 remember the fact that a character could have been inspected when the end of
186 the subject is reached. ***NOTE*** If the start of this table is modified, the
187 two tables that follow must also be modified. */
189 static const pcre_uint8 poptable
[] = {
191 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
192 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
193 1, 1, 1, /* Any, AllAny, Anybyte */
195 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
197 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
202 /* Positive single-char repeats */
203 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
204 1, 1, 1, /* upto, minupto, exact */
205 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
206 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
207 1, 1, 1, /* upto I, minupto I, exact I */
208 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
209 /* Negative single-char repeats - only for chars < 256 */
210 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
211 1, 1, 1, /* NOT upto, minupto, exact */
212 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
213 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
214 1, 1, 1, /* NOT upto I, minupto I, exact I */
215 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
216 /* Positive type repeats */
217 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
218 1, 1, 1, /* Type upto, minupto, exact */
219 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
220 /* Character class & ref repeats */
221 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
222 1, 1, /* CRRANGE, CRMINRANGE */
225 1, /* XCLASS - variable length */
238 0, /* Assert behind */
239 0, /* Assert behind not */
240 0, 0, /* ONCE, ONCE_NC */
241 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
242 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
243 0, 0, /* CREF, NCREF */
244 0, 0, /* RREF, NRREF */
246 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
247 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
248 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
249 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
250 0, 0 /* CLOSE, SKIPZERO */
253 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
256 static const pcre_uint8 toptable1
[] = {
258 ctype_digit
, ctype_digit
,
259 ctype_space
, ctype_space
,
260 ctype_word
, ctype_word
,
261 0, 0 /* OP_ANY, OP_ALLANY */
264 static const pcre_uint8 toptable2
[] = {
269 1, 1 /* OP_ANY, OP_ALLANY */
273 /* Structure for holding data about a particular state, which is in effect the
274 current data for an active path through the match tree. It must consist
275 entirely of ints because the working vector we are passed, and which we put
276 these structures in, is a vector of ints. */
278 typedef struct stateblock
{
279 int offset
; /* Offset to opcode */
280 int count
; /* Count for repeats */
281 int data
; /* Some use extra data */
284 #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int))
288 /*************************************************
289 * Print character string *
290 *************************************************/
292 /* Character string printing function for debugging.
296 length number of bytes
303 pchars(const pcre_uchar
*p
, int length
, FILE *f
)
308 if (isprint(c
= *(p
++)))
311 fprintf(f
, "\\x%02x", c
);
318 /*************************************************
319 * Execute a Regular Expression - DFA engine *
320 *************************************************/
322 /* This internal function applies a compiled pattern to a subject string,
323 starting at a given point, using a DFA engine. This function is called from the
324 external one, possibly multiple times if the pattern is not anchored. The
325 function calls itself recursively for some kinds of subpattern.
328 md the match_data block with fixed information
329 this_start_code the opening bracket of this subexpression's code
330 current_subject where we currently are in the subject string
331 start_offset start offset in the subject string
332 offsets vector to contain the matching string offsets
333 offsetcount size of same
334 workspace vector of workspace
336 rlevel function call recursion level
338 Returns: > 0 => number of match offset pairs placed in offsets
339 = 0 => offsets overflowed; longest matches are present
340 -1 => failed to match
341 < -1 => some kind of unexpected problem
343 The following macros are used for adding states to the two state vectors (one
344 for the current character, one for the following character). */
346 #define ADD_ACTIVE(x,y) \
347 if (active_count++ < wscount) \
349 next_active_state->offset = (x); \
350 next_active_state->count = (y); \
351 next_active_state++; \
352 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
354 else return PCRE_ERROR_DFA_WSSIZE
356 #define ADD_ACTIVE_DATA(x,y,z) \
357 if (active_count++ < wscount) \
359 next_active_state->offset = (x); \
360 next_active_state->count = (y); \
361 next_active_state->data = (z); \
362 next_active_state++; \
363 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
365 else return PCRE_ERROR_DFA_WSSIZE
367 #define ADD_NEW(x,y) \
368 if (new_count++ < wscount) \
370 next_new_state->offset = (x); \
371 next_new_state->count = (y); \
373 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
375 else return PCRE_ERROR_DFA_WSSIZE
377 #define ADD_NEW_DATA(x,y,z) \
378 if (new_count++ < wscount) \
380 next_new_state->offset = (x); \
381 next_new_state->count = (y); \
382 next_new_state->data = (z); \
384 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
385 (x), (y), (z), __LINE__)); \
387 else return PCRE_ERROR_DFA_WSSIZE
389 /* And now, here is the code */
394 const pcre_uchar
*this_start_code
,
395 const pcre_uchar
*current_subject
,
403 stateblock
*active_states
, *new_states
, *temp_states
;
404 stateblock
*next_active_state
, *next_new_state
;
406 const pcre_uint8
*ctypes
, *lcc
, *fcc
;
407 const pcre_uchar
*ptr
;
408 const pcre_uchar
*end_code
, *first_op
;
410 dfa_recursion_info new_recursive
;
412 int active_count
, new_count
, match_count
;
414 /* Some fields in the md block are frequently referenced, so we load them into
415 independent variables in the hope that this will perform better. */
417 const pcre_uchar
*start_subject
= md
->start_subject
;
418 const pcre_uchar
*end_subject
= md
->end_subject
;
419 const pcre_uchar
*start_code
= md
->start_code
;
422 BOOL utf
= (md
->poptions
& PCRE_UTF8
) != 0;
427 BOOL reset_could_continue
= FALSE
;
433 wscount
= (wscount
- (wscount
% (INTS_PER_STATEBLOCK
* 2))) /
434 (2 * INTS_PER_STATEBLOCK
);
436 DPRINTF(("\n%.*s---------------------\n"
437 "%.*sCall to internal_dfa_exec f=%d\n",
438 rlevel
*2-2, SP
, rlevel
*2-2, SP
, rlevel
));
440 ctypes
= md
->tables
+ ctypes_offset
;
441 lcc
= md
->tables
+ lcc_offset
;
442 fcc
= md
->tables
+ fcc_offset
;
444 match_count
= PCRE_ERROR_NOMATCH
; /* A negative number */
446 active_states
= (stateblock
*)(workspace
+ 2);
447 next_new_state
= new_states
= active_states
+ wscount
;
450 first_op
= this_start_code
+ 1 + LINK_SIZE
+
451 ((*this_start_code
== OP_CBRA
|| *this_start_code
== OP_SCBRA
||
452 *this_start_code
== OP_CBRAPOS
|| *this_start_code
== OP_SCBRAPOS
)
455 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456 the alternative states onto the list, and find out where the end is. This
457 makes it possible to use this function recursively, when we want to stop at a
458 matching internal ket rather than at the end.
460 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
461 a backward assertion. In that case, we have to find out the maximum amount to
462 move back, and set up each alternative appropriately. */
464 if (*first_op
== OP_REVERSE
)
469 end_code
= this_start_code
;
472 int back
= GET(end_code
, 2+LINK_SIZE
);
473 if (back
> max_back
) max_back
= back
;
474 end_code
+= GET(end_code
, 1);
476 while (*end_code
== OP_ALT
);
478 /* If we can't go back the amount required for the longest lookbehind
479 pattern, go back as far as we can; some alternatives may still be viable. */
482 /* In character mode we have to step back character by character */
486 for (gone_back
= 0; gone_back
< max_back
; gone_back
++)
488 if (current_subject
<= start_subject
) break;
490 ACROSSCHAR(current_subject
> start_subject
, *current_subject
, current_subject
--);
496 /* In byte-mode we can do this quickly. */
499 gone_back
= (current_subject
- max_back
< start_subject
)?
500 (int)(current_subject
- start_subject
) : max_back
;
501 current_subject
-= gone_back
;
504 /* Save the earliest consulted character */
506 if (current_subject
< md
->start_used_ptr
)
507 md
->start_used_ptr
= current_subject
;
509 /* Now we can process the individual branches. */
511 end_code
= this_start_code
;
514 int back
= GET(end_code
, 2+LINK_SIZE
);
515 if (back
<= gone_back
)
517 int bstate
= (int)(end_code
- start_code
+ 2 + 2*LINK_SIZE
);
518 ADD_NEW_DATA(-bstate
, 0, gone_back
- back
);
520 end_code
+= GET(end_code
, 1);
522 while (*end_code
== OP_ALT
);
525 /* This is the code for a "normal" subpattern (not a backward assertion). The
526 start of a whole pattern is always one of these. If we are at the top level,
527 we may be asked to restart matching from the same point that we reached for a
528 previous partial match. We still have to scan through the top-level branches to
529 find the end state. */
533 end_code
= this_start_code
;
537 if (rlevel
== 1 && (md
->moptions
& PCRE_DFA_RESTART
) != 0)
539 do { end_code
+= GET(end_code
, 1); } while (*end_code
== OP_ALT
);
540 new_count
= workspace
[1];
542 memcpy(new_states
, active_states
, new_count
* sizeof(stateblock
));
549 int length
= 1 + LINK_SIZE
+
550 ((*this_start_code
== OP_CBRA
|| *this_start_code
== OP_SCBRA
||
551 *this_start_code
== OP_CBRAPOS
|| *this_start_code
== OP_SCBRAPOS
)
555 ADD_NEW((int)(end_code
- start_code
+ length
), 0);
556 end_code
+= GET(end_code
, 1);
557 length
= 1 + LINK_SIZE
;
559 while (*end_code
== OP_ALT
);
563 workspace
[0] = 0; /* Bit indicating which vector is current */
565 DPRINTF(("%.*sEnd state = %d\n", rlevel
*2-2, SP
, (int)(end_code
- start_code
)));
567 /* Loop for scanning the subject */
569 ptr
= current_subject
;
576 BOOL partial_newline
= FALSE
;
577 BOOL could_continue
= reset_could_continue
;
578 reset_could_continue
= FALSE
;
580 /* Make the new state list into the active state list and empty the
583 temp_states
= active_states
;
584 active_states
= new_states
;
585 new_states
= temp_states
;
586 active_count
= new_count
;
589 workspace
[0] ^= 1; /* Remember for the restarting feature */
590 workspace
[1] = active_count
;
593 printf("%.*sNext character: rest of subject = \"", rlevel
*2-2, SP
);
594 pchars(ptr
, STRLEN_UC(ptr
), stdout
);
597 printf("%.*sActive states: ", rlevel
*2-2, SP
);
598 for (i
= 0; i
< active_count
; i
++)
599 printf("%d/%d ", active_states
[i
].offset
, active_states
[i
].count
);
603 /* Set the pointers for adding new states */
605 next_active_state
= active_states
+ active_count
;
606 next_new_state
= new_states
;
608 /* Load the current character from the subject outside the loop, as many
609 different states may want to look at it, and we assume that at least one
612 if (ptr
< end_subject
)
614 clen
= 1; /* Number of data items in the character */
616 if (utf
) { GETCHARLEN(c
, ptr
, clen
); } else
617 #endif /* SUPPORT_UTF */
622 clen
= 0; /* This indicates the end of the subject */
623 c
= NOTACHAR
; /* This value should never actually be used */
626 /* Scan up the active states and act on each one. The result of an action
627 may be to add more states to the currently active list (e.g. on hitting a
628 parenthesis) or it may be to put states on the new list, for considering
629 when we move the character pointer on. */
631 for (i
= 0; i
< active_count
; i
++)
633 stateblock
*current_state
= active_states
+ i
;
634 BOOL caseless
= FALSE
;
635 const pcre_uchar
*code
;
636 int state_offset
= current_state
->offset
;
637 int count
, codevalue
, rrc
;
640 printf ("%.*sProcessing state %d c=", rlevel
*2-2, SP
, state_offset
);
641 if (clen
== 0) printf("EOL\n");
642 else if (c
> 32 && c
< 127) printf("'%c'\n", c
);
643 else printf("0x%02x\n", c
);
646 /* A negative offset is a special case meaning "hold off going to this
647 (negated) state until the number of characters in the data field have
648 been skipped". If the could_continue flag was passed over from a previous
649 state, arrange for it to be passed on. */
651 if (state_offset
< 0)
653 if (current_state
->data
> 0)
655 DPRINTF(("%.*sSkipping this character\n", rlevel
*2-2, SP
));
656 ADD_NEW_DATA(state_offset
, current_state
->count
,
657 current_state
->data
- 1);
658 if (could_continue
) reset_could_continue
= TRUE
;
663 current_state
->offset
= state_offset
= -state_offset
;
667 /* Check for a duplicate state with the same count, and skip if found.
668 See the note at the head of this module about the possibility of improving
671 for (j
= 0; j
< i
; j
++)
673 if (active_states
[j
].offset
== state_offset
&&
674 active_states
[j
].count
== current_state
->count
)
676 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel
*2-2, SP
));
677 goto NEXT_ACTIVE_STATE
;
681 /* The state offset is the offset to the opcode */
683 code
= start_code
+ state_offset
;
686 /* If this opcode inspects a character, but we are at the end of the
687 subject, remember the fact for use when testing for a partial match. */
689 if (clen
== 0 && poptable
[codevalue
] != 0)
690 could_continue
= TRUE
;
692 /* If this opcode is followed by an inline character, load it. It is
693 tempting to test for the presence of a subject character here, but that
694 is wrong, because sometimes zero repetitions of the subject are
697 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
698 argument that is not a data character - but is always one byte long because
699 the values are small. We have to take special action to deal with \P, \p,
700 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
701 these ones to new opcodes. */
703 if (coptable
[codevalue
] > 0)
707 if (utf
) { GETCHARLEN(d
, (code
+ coptable
[codevalue
]), dlen
); } else
708 #endif /* SUPPORT_UTF */
709 d
= code
[coptable
[codevalue
]];
710 if (codevalue
>= OP_TYPESTAR
)
714 case OP_ANYBYTE
: return PCRE_ERROR_DFA_UITEM
;
716 case OP_PROP
: codevalue
+= OP_PROP_EXTRA
; break;
717 case OP_ANYNL
: codevalue
+= OP_ANYNL_EXTRA
; break;
718 case OP_EXTUNI
: codevalue
+= OP_EXTUNI_EXTRA
; break;
720 case OP_HSPACE
: codevalue
+= OP_HSPACE_EXTRA
; break;
722 case OP_VSPACE
: codevalue
+= OP_VSPACE_EXTRA
; break;
729 dlen
= 0; /* Not strictly necessary, but compilers moan */
730 d
= NOTACHAR
; /* if these variables are not set. */
734 /* Now process the individual opcodes */
738 /* ========================================================================== */
739 /* These cases are never obeyed. This is a fudge that causes a compile-
740 time error if the vectors coptable or poptable, which are indexed by
741 opcode, are not the correct length. It seems to be the only way to do
742 such a check at compile time, as the sizeof() operator does not work
743 in the C preprocessor. */
745 case OP_TABLE_LENGTH
:
746 case OP_TABLE_LENGTH
+
747 ((sizeof(coptable
) == OP_TABLE_LENGTH
) &&
748 (sizeof(poptable
) == OP_TABLE_LENGTH
)):
751 /* ========================================================================== */
752 /* Reached a closing bracket. If not at the end of the pattern, carry
753 on with the next opcode. For repeating opcodes, also add the repeat
754 state. Note that KETRPOS will always be encountered at the end of the
755 subpattern, because the possessive subpattern repeats are always handled
756 using recursive calls. Thus, it never adds any new states.
758 At the end of the (sub)pattern, unless we have an empty string and
759 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
760 start of the subject, save the match data, shifting up all previous
761 matches so we always have the longest first. */
767 if (code
!= end_code
)
769 ADD_ACTIVE(state_offset
+ 1 + LINK_SIZE
, 0);
770 if (codevalue
!= OP_KET
)
772 ADD_ACTIVE(state_offset
- GET(code
, 1), 0);
777 if (ptr
> current_subject
||
778 ((md
->moptions
& PCRE_NOTEMPTY
) == 0 &&
779 ((md
->moptions
& PCRE_NOTEMPTY_ATSTART
) == 0 ||
780 current_subject
> start_subject
+ md
->start_offset
)))
782 if (match_count
< 0) match_count
= (offsetcount
>= 2)? 1 : 0;
783 else if (match_count
> 0 && ++match_count
* 2 > offsetcount
)
785 count
= ((match_count
== 0)? offsetcount
: match_count
* 2) - 2;
786 if (count
> 0) memmove(offsets
+ 2, offsets
, count
* sizeof(int));
787 if (offsetcount
>= 2)
789 offsets
[0] = (int)(current_subject
- start_subject
);
790 offsets
[1] = (int)(ptr
- start_subject
);
791 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel
*2-2, SP
,
792 offsets
[1] - offsets
[0], (char *)current_subject
));
794 if ((md
->moptions
& PCRE_DFA_SHORTEST
) != 0)
796 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
797 "%.*s---------------------\n\n", rlevel
*2-2, SP
, rlevel
,
798 match_count
, rlevel
*2-2, SP
));
805 /* ========================================================================== */
806 /* These opcodes add to the current list of states without looking
807 at the current character. */
809 /*-----------------------------------------------------------------*/
811 do { code
+= GET(code
, 1); } while (*code
== OP_ALT
);
812 ADD_ACTIVE((int)(code
- start_code
), 0);
815 /*-----------------------------------------------------------------*/
820 ADD_ACTIVE((int)(code
- start_code
+ 1 + LINK_SIZE
), 0);
821 code
+= GET(code
, 1);
823 while (*code
== OP_ALT
);
826 /*-----------------------------------------------------------------*/
829 ADD_ACTIVE((int)(code
- start_code
+ 1 + LINK_SIZE
+ IMM2_SIZE
), 0);
830 code
+= GET(code
, 1);
831 while (*code
== OP_ALT
)
833 ADD_ACTIVE((int)(code
- start_code
+ 1 + LINK_SIZE
), 0);
834 code
+= GET(code
, 1);
838 /*-----------------------------------------------------------------*/
841 ADD_ACTIVE(state_offset
+ 1, 0);
842 code
+= 1 + GET(code
, 2);
843 while (*code
== OP_ALT
) code
+= GET(code
, 1);
844 ADD_ACTIVE((int)(code
- start_code
+ 1 + LINK_SIZE
), 0);
847 /*-----------------------------------------------------------------*/
849 code
+= 1 + GET(code
, 2);
850 while (*code
== OP_ALT
) code
+= GET(code
, 1);
851 ADD_ACTIVE((int)(code
- start_code
+ 1 + LINK_SIZE
), 0);
854 /*-----------------------------------------------------------------*/
856 if (ptr
== start_subject
&& (md
->moptions
& PCRE_NOTBOL
) == 0)
857 { ADD_ACTIVE(state_offset
+ 1, 0); }
860 /*-----------------------------------------------------------------*/
862 if ((ptr
== start_subject
&& (md
->moptions
& PCRE_NOTBOL
) == 0) ||
863 (ptr
!= end_subject
&& WAS_NEWLINE(ptr
)))
864 { ADD_ACTIVE(state_offset
+ 1, 0); }
867 /*-----------------------------------------------------------------*/
869 if (ptr
>= end_subject
)
871 if ((md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
872 could_continue
= TRUE
;
873 else { ADD_ACTIVE(state_offset
+ 1, 0); }
877 /*-----------------------------------------------------------------*/
879 if (ptr
== start_subject
) { ADD_ACTIVE(state_offset
+ 1, 0); }
882 /*-----------------------------------------------------------------*/
884 if (ptr
== start_subject
+ start_offset
) { ADD_ACTIVE(state_offset
+ 1, 0); }
888 /* ========================================================================== */
889 /* These opcodes inspect the next subject character, and sometimes
890 the previous one as well, but do not have an argument. The variable
891 clen contains the length of the current character and is zero if we are
892 at the end of the subject. */
894 /*-----------------------------------------------------------------*/
896 if (clen
> 0 && !IS_NEWLINE(ptr
))
898 if (ptr
+ 1 >= md
->end_subject
&&
899 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
900 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
901 NLBLOCK
->nllen
== 2 &&
904 could_continue
= partial_newline
= TRUE
;
908 ADD_NEW(state_offset
+ 1, 0);
913 /*-----------------------------------------------------------------*/
916 { ADD_NEW(state_offset
+ 1, 0); }
919 /*-----------------------------------------------------------------*/
921 if (clen
== 0 && (md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
922 could_continue
= TRUE
;
923 else if (clen
== 0 || (IS_NEWLINE(ptr
) && ptr
== end_subject
- md
->nllen
))
924 { ADD_ACTIVE(state_offset
+ 1, 0); }
927 /*-----------------------------------------------------------------*/
929 if ((md
->moptions
& PCRE_NOTEOL
) == 0)
931 if (clen
== 0 && (md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
932 could_continue
= TRUE
;
933 else if (clen
== 0 ||
934 ((md
->poptions
& PCRE_DOLLAR_ENDONLY
) == 0 && IS_NEWLINE(ptr
) &&
935 (ptr
== end_subject
- md
->nllen
)
937 { ADD_ACTIVE(state_offset
+ 1, 0); }
938 else if (ptr
+ 1 >= md
->end_subject
&&
939 (md
->moptions
& (PCRE_PARTIAL_HARD
|PCRE_PARTIAL_SOFT
)) != 0 &&
940 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
941 NLBLOCK
->nllen
== 2 &&
944 if ((md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
946 reset_could_continue
= TRUE
;
947 ADD_NEW_DATA(-(state_offset
+ 1), 0, 1);
949 else could_continue
= partial_newline
= TRUE
;
954 /*-----------------------------------------------------------------*/
956 if ((md
->moptions
& PCRE_NOTEOL
) == 0)
958 if (clen
== 0 && (md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
959 could_continue
= TRUE
;
960 else if (clen
== 0 ||
961 ((md
->poptions
& PCRE_DOLLAR_ENDONLY
) == 0 && IS_NEWLINE(ptr
)))
962 { ADD_ACTIVE(state_offset
+ 1, 0); }
963 else if (ptr
+ 1 >= md
->end_subject
&&
964 (md
->moptions
& (PCRE_PARTIAL_HARD
|PCRE_PARTIAL_SOFT
)) != 0 &&
965 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
966 NLBLOCK
->nllen
== 2 &&
969 if ((md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
971 reset_could_continue
= TRUE
;
972 ADD_NEW_DATA(-(state_offset
+ 1), 0, 1);
974 else could_continue
= partial_newline
= TRUE
;
977 else if (IS_NEWLINE(ptr
))
978 { ADD_ACTIVE(state_offset
+ 1, 0); }
981 /*-----------------------------------------------------------------*/
986 if (clen
> 0 && c
< 256 &&
987 ((ctypes
[c
] & toptable1
[codevalue
]) ^ toptable2
[codevalue
]) != 0)
988 { ADD_NEW(state_offset
+ 1, 0); }
991 /*-----------------------------------------------------------------*/
993 case OP_NOT_WHITESPACE
:
994 case OP_NOT_WORDCHAR
:
995 if (clen
> 0 && (c
>= 256 ||
996 ((ctypes
[c
] & toptable1
[codevalue
]) ^ toptable2
[codevalue
]) != 0))
997 { ADD_NEW(state_offset
+ 1, 0); }
1000 /*-----------------------------------------------------------------*/
1001 case OP_WORD_BOUNDARY
:
1002 case OP_NOT_WORD_BOUNDARY
:
1004 int left_word
, right_word
;
1006 if (ptr
> start_subject
)
1008 const pcre_uchar
*temp
= ptr
- 1;
1009 if (temp
< md
->start_used_ptr
) md
->start_used_ptr
= temp
;
1011 if (utf
) { BACKCHAR(temp
); }
1013 GETCHARTEST(d
, temp
);
1015 if ((md
->poptions
& PCRE_UCP
) != 0)
1017 if (d
== '_') left_word
= TRUE
; else
1019 int cat
= UCD_CATEGORY(d
);
1020 left_word
= (cat
== ucp_L
|| cat
== ucp_N
);
1025 left_word
= d
< 256 && (ctypes
[d
] & ctype_word
) != 0;
1027 else left_word
= FALSE
;
1032 if ((md
->poptions
& PCRE_UCP
) != 0)
1034 if (c
== '_') right_word
= TRUE
; else
1036 int cat
= UCD_CATEGORY(c
);
1037 right_word
= (cat
== ucp_L
|| cat
== ucp_N
);
1042 right_word
= c
< 256 && (ctypes
[c
] & ctype_word
) != 0;
1044 else right_word
= FALSE
;
1046 if ((left_word
== right_word
) == (codevalue
== OP_NOT_WORD_BOUNDARY
))
1047 { ADD_ACTIVE(state_offset
+ 1, 0); }
1052 /*-----------------------------------------------------------------*/
1053 /* Check the next character by Unicode property. We will get here only
1054 if the support is in the binary; otherwise a compile-time error occurs.
1063 const pcre_uint8 chartype
= UCD_CHARTYPE(c
);
1071 OK
= chartype
== ucp_Lu
|| chartype
== ucp_Ll
||
1076 OK
= PRIV(ucp_gentype
)[chartype
] == code
[2];
1080 OK
= chartype
== code
[2];
1084 OK
= UCD_SCRIPT(c
) == code
[2];
1087 /* These are specials for combination cases. */
1090 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1091 PRIV(ucp_gentype
)[chartype
] == ucp_N
;
1094 case PT_SPACE
: /* Perl space */
1095 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1096 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_FF
|| c
== CHAR_CR
;
1099 case PT_PXSPACE
: /* POSIX space */
1100 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1101 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_VT
||
1102 c
== CHAR_FF
|| c
== CHAR_CR
;
1106 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1107 PRIV(ucp_gentype
)[chartype
] == ucp_N
||
1108 c
== CHAR_UNDERSCORE
;
1111 /* Should never occur, but keep compilers from grumbling. */
1114 OK
= codevalue
!= OP_PROP
;
1118 if (OK
== (codevalue
== OP_PROP
)) { ADD_NEW(state_offset
+ 3, 0); }
1125 /* ========================================================================== */
1126 /* These opcodes likewise inspect the subject character, but have an
1127 argument that is not a data character. It is one of these opcodes:
1128 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
1129 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1132 case OP_TYPEMINPLUS
:
1133 case OP_TYPEPOSPLUS
:
1134 count
= current_state
->count
; /* Already matched */
1135 if (count
> 0) { ADD_ACTIVE(state_offset
+ 2, 0); }
1138 if (d
== OP_ANY
&& ptr
+ 1 >= md
->end_subject
&&
1139 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
1140 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
1141 NLBLOCK
->nllen
== 2 &&
1142 c
== NLBLOCK
->nl
[0])
1144 could_continue
= partial_newline
= TRUE
;
1146 else if ((c
>= 256 && d
!= OP_DIGIT
&& d
!= OP_WHITESPACE
&& d
!= OP_WORDCHAR
) ||
1148 (d
!= OP_ANY
|| !IS_NEWLINE(ptr
)) &&
1149 ((ctypes
[c
] & toptable1
[d
]) ^ toptable2
[d
]) != 0))
1151 if (count
> 0 && codevalue
== OP_TYPEPOSPLUS
)
1153 active_count
--; /* Remove non-match possibility */
1154 next_active_state
--;
1157 ADD_NEW(state_offset
, count
);
1162 /*-----------------------------------------------------------------*/
1164 case OP_TYPEMINQUERY
:
1165 case OP_TYPEPOSQUERY
:
1166 ADD_ACTIVE(state_offset
+ 2, 0);
1169 if (d
== OP_ANY
&& ptr
+ 1 >= md
->end_subject
&&
1170 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
1171 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
1172 NLBLOCK
->nllen
== 2 &&
1173 c
== NLBLOCK
->nl
[0])
1175 could_continue
= partial_newline
= TRUE
;
1177 else if ((c
>= 256 && d
!= OP_DIGIT
&& d
!= OP_WHITESPACE
&& d
!= OP_WORDCHAR
) ||
1179 (d
!= OP_ANY
|| !IS_NEWLINE(ptr
)) &&
1180 ((ctypes
[c
] & toptable1
[d
]) ^ toptable2
[d
]) != 0))
1182 if (codevalue
== OP_TYPEPOSQUERY
)
1184 active_count
--; /* Remove non-match possibility */
1185 next_active_state
--;
1187 ADD_NEW(state_offset
+ 2, 0);
1192 /*-----------------------------------------------------------------*/
1194 case OP_TYPEMINSTAR
:
1195 case OP_TYPEPOSSTAR
:
1196 ADD_ACTIVE(state_offset
+ 2, 0);
1199 if (d
== OP_ANY
&& ptr
+ 1 >= md
->end_subject
&&
1200 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
1201 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
1202 NLBLOCK
->nllen
== 2 &&
1203 c
== NLBLOCK
->nl
[0])
1205 could_continue
= partial_newline
= TRUE
;
1207 else if ((c
>= 256 && d
!= OP_DIGIT
&& d
!= OP_WHITESPACE
&& d
!= OP_WORDCHAR
) ||
1209 (d
!= OP_ANY
|| !IS_NEWLINE(ptr
)) &&
1210 ((ctypes
[c
] & toptable1
[d
]) ^ toptable2
[d
]) != 0))
1212 if (codevalue
== OP_TYPEPOSSTAR
)
1214 active_count
--; /* Remove non-match possibility */
1215 next_active_state
--;
1217 ADD_NEW(state_offset
, 0);
1222 /*-----------------------------------------------------------------*/
1224 count
= current_state
->count
; /* Number already matched */
1227 if (d
== OP_ANY
&& ptr
+ 1 >= md
->end_subject
&&
1228 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
1229 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
1230 NLBLOCK
->nllen
== 2 &&
1231 c
== NLBLOCK
->nl
[0])
1233 could_continue
= partial_newline
= TRUE
;
1235 else if ((c
>= 256 && d
!= OP_DIGIT
&& d
!= OP_WHITESPACE
&& d
!= OP_WORDCHAR
) ||
1237 (d
!= OP_ANY
|| !IS_NEWLINE(ptr
)) &&
1238 ((ctypes
[c
] & toptable1
[d
]) ^ toptable2
[d
]) != 0))
1240 if (++count
>= GET2(code
, 1))
1241 { ADD_NEW(state_offset
+ 1 + IMM2_SIZE
+ 1, 0); }
1243 { ADD_NEW(state_offset
, count
); }
1248 /*-----------------------------------------------------------------*/
1250 case OP_TYPEMINUPTO
:
1251 case OP_TYPEPOSUPTO
:
1252 ADD_ACTIVE(state_offset
+ 2 + IMM2_SIZE
, 0);
1253 count
= current_state
->count
; /* Number already matched */
1256 if (d
== OP_ANY
&& ptr
+ 1 >= md
->end_subject
&&
1257 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
1258 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
1259 NLBLOCK
->nllen
== 2 &&
1260 c
== NLBLOCK
->nl
[0])
1262 could_continue
= partial_newline
= TRUE
;
1264 else if ((c
>= 256 && d
!= OP_DIGIT
&& d
!= OP_WHITESPACE
&& d
!= OP_WORDCHAR
) ||
1266 (d
!= OP_ANY
|| !IS_NEWLINE(ptr
)) &&
1267 ((ctypes
[c
] & toptable1
[d
]) ^ toptable2
[d
]) != 0))
1269 if (codevalue
== OP_TYPEPOSUPTO
)
1271 active_count
--; /* Remove non-match possibility */
1272 next_active_state
--;
1274 if (++count
>= GET2(code
, 1))
1275 { ADD_NEW(state_offset
+ 2 + IMM2_SIZE
, 0); }
1277 { ADD_NEW(state_offset
, count
); }
1282 /* ========================================================================== */
1283 /* These are virtual opcodes that are used when something like
1284 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1285 argument. It keeps the code above fast for the other cases. The argument
1286 is in the d variable. */
1289 case OP_PROP_EXTRA
+ OP_TYPEPLUS
:
1290 case OP_PROP_EXTRA
+ OP_TYPEMINPLUS
:
1291 case OP_PROP_EXTRA
+ OP_TYPEPOSPLUS
:
1292 count
= current_state
->count
; /* Already matched */
1293 if (count
> 0) { ADD_ACTIVE(state_offset
+ 4, 0); }
1297 const pcre_uint8 chartype
= UCD_CHARTYPE(c
);
1305 OK
= chartype
== ucp_Lu
|| chartype
== ucp_Ll
||
1310 OK
= PRIV(ucp_gentype
)[chartype
] == code
[3];
1314 OK
= chartype
== code
[3];
1318 OK
= UCD_SCRIPT(c
) == code
[3];
1321 /* These are specials for combination cases. */
1324 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1325 PRIV(ucp_gentype
)[chartype
] == ucp_N
;
1328 case PT_SPACE
: /* Perl space */
1329 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1330 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_FF
|| c
== CHAR_CR
;
1333 case PT_PXSPACE
: /* POSIX space */
1334 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1335 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_VT
||
1336 c
== CHAR_FF
|| c
== CHAR_CR
;
1340 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1341 PRIV(ucp_gentype
)[chartype
] == ucp_N
||
1342 c
== CHAR_UNDERSCORE
;
1345 /* Should never occur, but keep compilers from grumbling. */
1348 OK
= codevalue
!= OP_PROP
;
1352 if (OK
== (d
== OP_PROP
))
1354 if (count
> 0 && codevalue
== OP_PROP_EXTRA
+ OP_TYPEPOSPLUS
)
1356 active_count
--; /* Remove non-match possibility */
1357 next_active_state
--;
1360 ADD_NEW(state_offset
, count
);
1365 /*-----------------------------------------------------------------*/
1366 case OP_EXTUNI_EXTRA
+ OP_TYPEPLUS
:
1367 case OP_EXTUNI_EXTRA
+ OP_TYPEMINPLUS
:
1368 case OP_EXTUNI_EXTRA
+ OP_TYPEPOSPLUS
:
1369 count
= current_state
->count
; /* Already matched */
1370 if (count
> 0) { ADD_ACTIVE(state_offset
+ 2, 0); }
1371 if (clen
> 0 && UCD_CATEGORY(c
) != ucp_M
)
1373 const pcre_uchar
*nptr
= ptr
+ clen
;
1375 if (count
> 0 && codevalue
== OP_EXTUNI_EXTRA
+ OP_TYPEPOSPLUS
)
1377 active_count
--; /* Remove non-match possibility */
1378 next_active_state
--;
1380 while (nptr
< end_subject
)
1384 GETCHARLEN(nd
, nptr
, ndlen
);
1385 if (UCD_CATEGORY(nd
) != ucp_M
) break;
1390 ADD_NEW_DATA(-state_offset
, count
, ncount
);
1395 /*-----------------------------------------------------------------*/
1396 case OP_ANYNL_EXTRA
+ OP_TYPEPLUS
:
1397 case OP_ANYNL_EXTRA
+ OP_TYPEMINPLUS
:
1398 case OP_ANYNL_EXTRA
+ OP_TYPEPOSPLUS
:
1399 count
= current_state
->count
; /* Already matched */
1400 if (count
> 0) { ADD_ACTIVE(state_offset
+ 2, 0); }
1411 if ((md
->moptions
& PCRE_BSR_ANYCRLF
) != 0) break;
1415 if (ptr
+ 1 < end_subject
&& ptr
[1] == 0x0a) ncount
= 1;
1420 if (count
> 0 && codevalue
== OP_ANYNL_EXTRA
+ OP_TYPEPOSPLUS
)
1422 active_count
--; /* Remove non-match possibility */
1423 next_active_state
--;
1426 ADD_NEW_DATA(-state_offset
, count
, ncount
);
1435 /*-----------------------------------------------------------------*/
1436 case OP_VSPACE_EXTRA
+ OP_TYPEPLUS
:
1437 case OP_VSPACE_EXTRA
+ OP_TYPEMINPLUS
:
1438 case OP_VSPACE_EXTRA
+ OP_TYPEPOSPLUS
:
1439 count
= current_state
->count
; /* Already matched */
1440 if (count
> 0) { ADD_ACTIVE(state_offset
+ 2, 0); }
1461 if (OK
== (d
== OP_VSPACE
))
1463 if (count
> 0 && codevalue
== OP_VSPACE_EXTRA
+ OP_TYPEPOSPLUS
)
1465 active_count
--; /* Remove non-match possibility */
1466 next_active_state
--;
1469 ADD_NEW_DATA(-state_offset
, count
, 0);
1474 /*-----------------------------------------------------------------*/
1475 case OP_HSPACE_EXTRA
+ OP_TYPEPLUS
:
1476 case OP_HSPACE_EXTRA
+ OP_TYPEMINPLUS
:
1477 case OP_HSPACE_EXTRA
+ OP_TYPEPOSPLUS
:
1478 count
= current_state
->count
; /* Already matched */
1479 if (count
> 0) { ADD_ACTIVE(state_offset
+ 2, 0); }
1486 case 0x20: /* SPACE */
1487 case 0xa0: /* NBSP */
1488 case 0x1680: /* OGHAM SPACE MARK */
1489 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1490 case 0x2000: /* EN QUAD */
1491 case 0x2001: /* EM QUAD */
1492 case 0x2002: /* EN SPACE */
1493 case 0x2003: /* EM SPACE */
1494 case 0x2004: /* THREE-PER-EM SPACE */
1495 case 0x2005: /* FOUR-PER-EM SPACE */
1496 case 0x2006: /* SIX-PER-EM SPACE */
1497 case 0x2007: /* FIGURE SPACE */
1498 case 0x2008: /* PUNCTUATION SPACE */
1499 case 0x2009: /* THIN SPACE */
1500 case 0x200A: /* HAIR SPACE */
1501 case 0x202f: /* NARROW NO-BREAK SPACE */
1502 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1503 case 0x3000: /* IDEOGRAPHIC SPACE */
1512 if (OK
== (d
== OP_HSPACE
))
1514 if (count
> 0 && codevalue
== OP_HSPACE_EXTRA
+ OP_TYPEPOSPLUS
)
1516 active_count
--; /* Remove non-match possibility */
1517 next_active_state
--;
1520 ADD_NEW_DATA(-state_offset
, count
, 0);
1525 /*-----------------------------------------------------------------*/
1527 case OP_PROP_EXTRA
+ OP_TYPEQUERY
:
1528 case OP_PROP_EXTRA
+ OP_TYPEMINQUERY
:
1529 case OP_PROP_EXTRA
+ OP_TYPEPOSQUERY
:
1533 case OP_PROP_EXTRA
+ OP_TYPESTAR
:
1534 case OP_PROP_EXTRA
+ OP_TYPEMINSTAR
:
1535 case OP_PROP_EXTRA
+ OP_TYPEPOSSTAR
:
1540 ADD_ACTIVE(state_offset
+ 4, 0);
1544 const pcre_uint8 chartype
= UCD_CHARTYPE(c
);
1552 OK
= chartype
== ucp_Lu
|| chartype
== ucp_Ll
||
1557 OK
= PRIV(ucp_gentype
)[chartype
] == code
[3];
1561 OK
= chartype
== code
[3];
1565 OK
= UCD_SCRIPT(c
) == code
[3];
1568 /* These are specials for combination cases. */
1571 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1572 PRIV(ucp_gentype
)[chartype
] == ucp_N
;
1575 case PT_SPACE
: /* Perl space */
1576 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1577 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_FF
|| c
== CHAR_CR
;
1580 case PT_PXSPACE
: /* POSIX space */
1581 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1582 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_VT
||
1583 c
== CHAR_FF
|| c
== CHAR_CR
;
1587 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1588 PRIV(ucp_gentype
)[chartype
] == ucp_N
||
1589 c
== CHAR_UNDERSCORE
;
1592 /* Should never occur, but keep compilers from grumbling. */
1595 OK
= codevalue
!= OP_PROP
;
1599 if (OK
== (d
== OP_PROP
))
1601 if (codevalue
== OP_PROP_EXTRA
+ OP_TYPEPOSSTAR
||
1602 codevalue
== OP_PROP_EXTRA
+ OP_TYPEPOSQUERY
)
1604 active_count
--; /* Remove non-match possibility */
1605 next_active_state
--;
1607 ADD_NEW(state_offset
+ count
, 0);
1612 /*-----------------------------------------------------------------*/
1613 case OP_EXTUNI_EXTRA
+ OP_TYPEQUERY
:
1614 case OP_EXTUNI_EXTRA
+ OP_TYPEMINQUERY
:
1615 case OP_EXTUNI_EXTRA
+ OP_TYPEPOSQUERY
:
1619 case OP_EXTUNI_EXTRA
+ OP_TYPESTAR
:
1620 case OP_EXTUNI_EXTRA
+ OP_TYPEMINSTAR
:
1621 case OP_EXTUNI_EXTRA
+ OP_TYPEPOSSTAR
:
1626 ADD_ACTIVE(state_offset
+ 2, 0);
1627 if (clen
> 0 && UCD_CATEGORY(c
) != ucp_M
)
1629 const pcre_uchar
*nptr
= ptr
+ clen
;
1631 if (codevalue
== OP_EXTUNI_EXTRA
+ OP_TYPEPOSSTAR
||
1632 codevalue
== OP_EXTUNI_EXTRA
+ OP_TYPEPOSQUERY
)
1634 active_count
--; /* Remove non-match possibility */
1635 next_active_state
--;
1637 while (nptr
< end_subject
)
1641 GETCHARLEN(nd
, nptr
, ndlen
);
1642 if (UCD_CATEGORY(nd
) != ucp_M
) break;
1646 ADD_NEW_DATA(-(state_offset
+ count
), 0, ncount
);
1651 /*-----------------------------------------------------------------*/
1652 case OP_ANYNL_EXTRA
+ OP_TYPEQUERY
:
1653 case OP_ANYNL_EXTRA
+ OP_TYPEMINQUERY
:
1654 case OP_ANYNL_EXTRA
+ OP_TYPEPOSQUERY
:
1658 case OP_ANYNL_EXTRA
+ OP_TYPESTAR
:
1659 case OP_ANYNL_EXTRA
+ OP_TYPEMINSTAR
:
1660 case OP_ANYNL_EXTRA
+ OP_TYPEPOSSTAR
:
1664 ADD_ACTIVE(state_offset
+ 2, 0);
1675 if ((md
->moptions
& PCRE_BSR_ANYCRLF
) != 0) break;
1679 if (ptr
+ 1 < end_subject
&& ptr
[1] == 0x0a) ncount
= 1;
1684 if (codevalue
== OP_ANYNL_EXTRA
+ OP_TYPEPOSSTAR
||
1685 codevalue
== OP_ANYNL_EXTRA
+ OP_TYPEPOSQUERY
)
1687 active_count
--; /* Remove non-match possibility */
1688 next_active_state
--;
1690 ADD_NEW_DATA(-(state_offset
+ count
), 0, ncount
);
1699 /*-----------------------------------------------------------------*/
1700 case OP_VSPACE_EXTRA
+ OP_TYPEQUERY
:
1701 case OP_VSPACE_EXTRA
+ OP_TYPEMINQUERY
:
1702 case OP_VSPACE_EXTRA
+ OP_TYPEPOSQUERY
:
1706 case OP_VSPACE_EXTRA
+ OP_TYPESTAR
:
1707 case OP_VSPACE_EXTRA
+ OP_TYPEMINSTAR
:
1708 case OP_VSPACE_EXTRA
+ OP_TYPEPOSSTAR
:
1712 ADD_ACTIVE(state_offset
+ 2, 0);
1732 if (OK
== (d
== OP_VSPACE
))
1734 if (codevalue
== OP_VSPACE_EXTRA
+ OP_TYPEPOSSTAR
||
1735 codevalue
== OP_VSPACE_EXTRA
+ OP_TYPEPOSQUERY
)
1737 active_count
--; /* Remove non-match possibility */
1738 next_active_state
--;
1740 ADD_NEW_DATA(-(state_offset
+ count
), 0, 0);
1745 /*-----------------------------------------------------------------*/
1746 case OP_HSPACE_EXTRA
+ OP_TYPEQUERY
:
1747 case OP_HSPACE_EXTRA
+ OP_TYPEMINQUERY
:
1748 case OP_HSPACE_EXTRA
+ OP_TYPEPOSQUERY
:
1752 case OP_HSPACE_EXTRA
+ OP_TYPESTAR
:
1753 case OP_HSPACE_EXTRA
+ OP_TYPEMINSTAR
:
1754 case OP_HSPACE_EXTRA
+ OP_TYPEPOSSTAR
:
1758 ADD_ACTIVE(state_offset
+ 2, 0);
1765 case 0x20: /* SPACE */
1766 case 0xa0: /* NBSP */
1767 case 0x1680: /* OGHAM SPACE MARK */
1768 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1769 case 0x2000: /* EN QUAD */
1770 case 0x2001: /* EM QUAD */
1771 case 0x2002: /* EN SPACE */
1772 case 0x2003: /* EM SPACE */
1773 case 0x2004: /* THREE-PER-EM SPACE */
1774 case 0x2005: /* FOUR-PER-EM SPACE */
1775 case 0x2006: /* SIX-PER-EM SPACE */
1776 case 0x2007: /* FIGURE SPACE */
1777 case 0x2008: /* PUNCTUATION SPACE */
1778 case 0x2009: /* THIN SPACE */
1779 case 0x200A: /* HAIR SPACE */
1780 case 0x202f: /* NARROW NO-BREAK SPACE */
1781 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1782 case 0x3000: /* IDEOGRAPHIC SPACE */
1791 if (OK
== (d
== OP_HSPACE
))
1793 if (codevalue
== OP_HSPACE_EXTRA
+ OP_TYPEPOSSTAR
||
1794 codevalue
== OP_HSPACE_EXTRA
+ OP_TYPEPOSQUERY
)
1796 active_count
--; /* Remove non-match possibility */
1797 next_active_state
--;
1799 ADD_NEW_DATA(-(state_offset
+ count
), 0, 0);
1804 /*-----------------------------------------------------------------*/
1806 case OP_PROP_EXTRA
+ OP_TYPEEXACT
:
1807 case OP_PROP_EXTRA
+ OP_TYPEUPTO
:
1808 case OP_PROP_EXTRA
+ OP_TYPEMINUPTO
:
1809 case OP_PROP_EXTRA
+ OP_TYPEPOSUPTO
:
1810 if (codevalue
!= OP_PROP_EXTRA
+ OP_TYPEEXACT
)
1811 { ADD_ACTIVE(state_offset
+ 1 + IMM2_SIZE
+ 3, 0); }
1812 count
= current_state
->count
; /* Number already matched */
1816 const pcre_uint8 chartype
= UCD_CHARTYPE(c
);
1817 switch(code
[1 + IMM2_SIZE
+ 1])
1824 OK
= chartype
== ucp_Lu
|| chartype
== ucp_Ll
||
1829 OK
= PRIV(ucp_gentype
)[chartype
] == code
[1 + IMM2_SIZE
+ 2];
1833 OK
= chartype
== code
[1 + IMM2_SIZE
+ 2];
1837 OK
= UCD_SCRIPT(c
) == code
[1 + IMM2_SIZE
+ 2];
1840 /* These are specials for combination cases. */
1843 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1844 PRIV(ucp_gentype
)[chartype
] == ucp_N
;
1847 case PT_SPACE
: /* Perl space */
1848 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1849 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_FF
|| c
== CHAR_CR
;
1852 case PT_PXSPACE
: /* POSIX space */
1853 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1854 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_VT
||
1855 c
== CHAR_FF
|| c
== CHAR_CR
;
1859 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1860 PRIV(ucp_gentype
)[chartype
] == ucp_N
||
1861 c
== CHAR_UNDERSCORE
;
1864 /* Should never occur, but keep compilers from grumbling. */
1867 OK
= codevalue
!= OP_PROP
;
1871 if (OK
== (d
== OP_PROP
))
1873 if (codevalue
== OP_PROP_EXTRA
+ OP_TYPEPOSUPTO
)
1875 active_count
--; /* Remove non-match possibility */
1876 next_active_state
--;
1878 if (++count
>= GET2(code
, 1))
1879 { ADD_NEW(state_offset
+ 1 + IMM2_SIZE
+ 3, 0); }
1881 { ADD_NEW(state_offset
, count
); }
1886 /*-----------------------------------------------------------------*/
1887 case OP_EXTUNI_EXTRA
+ OP_TYPEEXACT
:
1888 case OP_EXTUNI_EXTRA
+ OP_TYPEUPTO
:
1889 case OP_EXTUNI_EXTRA
+ OP_TYPEMINUPTO
:
1890 case OP_EXTUNI_EXTRA
+ OP_TYPEPOSUPTO
:
1891 if (codevalue
!= OP_EXTUNI_EXTRA
+ OP_TYPEEXACT
)
1892 { ADD_ACTIVE(state_offset
+ 2 + IMM2_SIZE
, 0); }
1893 count
= current_state
->count
; /* Number already matched */
1894 if (clen
> 0 && UCD_CATEGORY(c
) != ucp_M
)
1896 const pcre_uchar
*nptr
= ptr
+ clen
;
1898 if (codevalue
== OP_EXTUNI_EXTRA
+ OP_TYPEPOSUPTO
)
1900 active_count
--; /* Remove non-match possibility */
1901 next_active_state
--;
1903 while (nptr
< end_subject
)
1907 GETCHARLEN(nd
, nptr
, ndlen
);
1908 if (UCD_CATEGORY(nd
) != ucp_M
) break;
1912 if (nptr
>= end_subject
&& (md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
1913 reset_could_continue
= TRUE
;
1914 if (++count
>= GET2(code
, 1))
1915 { ADD_NEW_DATA(-(state_offset
+ 2 + IMM2_SIZE
), 0, ncount
); }
1917 { ADD_NEW_DATA(-state_offset
, count
, ncount
); }
1922 /*-----------------------------------------------------------------*/
1923 case OP_ANYNL_EXTRA
+ OP_TYPEEXACT
:
1924 case OP_ANYNL_EXTRA
+ OP_TYPEUPTO
:
1925 case OP_ANYNL_EXTRA
+ OP_TYPEMINUPTO
:
1926 case OP_ANYNL_EXTRA
+ OP_TYPEPOSUPTO
:
1927 if (codevalue
!= OP_ANYNL_EXTRA
+ OP_TYPEEXACT
)
1928 { ADD_ACTIVE(state_offset
+ 2 + IMM2_SIZE
, 0); }
1929 count
= current_state
->count
; /* Number already matched */
1940 if ((md
->moptions
& PCRE_BSR_ANYCRLF
) != 0) break;
1944 if (ptr
+ 1 < end_subject
&& ptr
[1] == 0x0a) ncount
= 1;
1949 if (codevalue
== OP_ANYNL_EXTRA
+ OP_TYPEPOSUPTO
)
1951 active_count
--; /* Remove non-match possibility */
1952 next_active_state
--;
1954 if (++count
>= GET2(code
, 1))
1955 { ADD_NEW_DATA(-(state_offset
+ 2 + IMM2_SIZE
), 0, ncount
); }
1957 { ADD_NEW_DATA(-state_offset
, count
, ncount
); }
1966 /*-----------------------------------------------------------------*/
1967 case OP_VSPACE_EXTRA
+ OP_TYPEEXACT
:
1968 case OP_VSPACE_EXTRA
+ OP_TYPEUPTO
:
1969 case OP_VSPACE_EXTRA
+ OP_TYPEMINUPTO
:
1970 case OP_VSPACE_EXTRA
+ OP_TYPEPOSUPTO
:
1971 if (codevalue
!= OP_VSPACE_EXTRA
+ OP_TYPEEXACT
)
1972 { ADD_ACTIVE(state_offset
+ 2 + IMM2_SIZE
, 0); }
1973 count
= current_state
->count
; /* Number already matched */
1993 if (OK
== (d
== OP_VSPACE
))
1995 if (codevalue
== OP_VSPACE_EXTRA
+ OP_TYPEPOSUPTO
)
1997 active_count
--; /* Remove non-match possibility */
1998 next_active_state
--;
2000 if (++count
>= GET2(code
, 1))
2001 { ADD_NEW_DATA(-(state_offset
+ 2 + IMM2_SIZE
), 0, 0); }
2003 { ADD_NEW_DATA(-state_offset
, count
, 0); }
2008 /*-----------------------------------------------------------------*/
2009 case OP_HSPACE_EXTRA
+ OP_TYPEEXACT
:
2010 case OP_HSPACE_EXTRA
+ OP_TYPEUPTO
:
2011 case OP_HSPACE_EXTRA
+ OP_TYPEMINUPTO
:
2012 case OP_HSPACE_EXTRA
+ OP_TYPEPOSUPTO
:
2013 if (codevalue
!= OP_HSPACE_EXTRA
+ OP_TYPEEXACT
)
2014 { ADD_ACTIVE(state_offset
+ 2 + IMM2_SIZE
, 0); }
2015 count
= current_state
->count
; /* Number already matched */
2022 case 0x20: /* SPACE */
2023 case 0xa0: /* NBSP */
2024 case 0x1680: /* OGHAM SPACE MARK */
2025 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2026 case 0x2000: /* EN QUAD */
2027 case 0x2001: /* EM QUAD */
2028 case 0x2002: /* EN SPACE */
2029 case 0x2003: /* EM SPACE */
2030 case 0x2004: /* THREE-PER-EM SPACE */
2031 case 0x2005: /* FOUR-PER-EM SPACE */
2032 case 0x2006: /* SIX-PER-EM SPACE */
2033 case 0x2007: /* FIGURE SPACE */
2034 case 0x2008: /* PUNCTUATION SPACE */
2035 case 0x2009: /* THIN SPACE */
2036 case 0x200A: /* HAIR SPACE */
2037 case 0x202f: /* NARROW NO-BREAK SPACE */
2038 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2039 case 0x3000: /* IDEOGRAPHIC SPACE */
2048 if (OK
== (d
== OP_HSPACE
))
2050 if (codevalue
== OP_HSPACE_EXTRA
+ OP_TYPEPOSUPTO
)
2052 active_count
--; /* Remove non-match possibility */
2053 next_active_state
--;
2055 if (++count
>= GET2(code
, 1))
2056 { ADD_NEW_DATA(-(state_offset
+ 2 + IMM2_SIZE
), 0, 0); }
2058 { ADD_NEW_DATA(-state_offset
, count
, 0); }
2063 /* ========================================================================== */
2064 /* These opcodes are followed by a character that is usually compared
2065 to the current subject character; it is loaded into d. We still get
2066 here even if there is no subject character, because in some cases zero
2067 repetitions are permitted. */
2069 /*-----------------------------------------------------------------*/
2071 if (clen
> 0 && c
== d
) { ADD_NEW(state_offset
+ dlen
+ 1, 0); }
2074 /*-----------------------------------------------------------------*/
2076 if (clen
== 0) break;
2081 if (c
== d
) { ADD_NEW(state_offset
+ dlen
+ 1, 0); } else
2083 unsigned int othercase
;
2087 /* If we have Unicode property support, we can use it to test the
2088 other case of the character. */
2090 othercase
= UCD_OTHERCASE(c
);
2092 othercase
= NOTACHAR
;
2095 if (d
== othercase
) { ADD_NEW(state_offset
+ dlen
+ 1, 0); }
2099 #endif /* SUPPORT_UTF */
2102 if (TABLE_GET(c
, lcc
, c
) == TABLE_GET(d
, lcc
, d
))
2103 { ADD_NEW(state_offset
+ 2, 0); }
2109 /*-----------------------------------------------------------------*/
2110 /* This is a tricky one because it can match more than one character.
2111 Find out how many characters to skip, and then set up a negative state
2112 to wait for them to pass before continuing. */
2115 if (clen
> 0 && UCD_CATEGORY(c
) != ucp_M
)
2117 const pcre_uchar
*nptr
= ptr
+ clen
;
2119 while (nptr
< end_subject
)
2122 GETCHARLEN(c
, nptr
, nclen
);
2123 if (UCD_CATEGORY(c
) != ucp_M
) break;
2127 if (nptr
>= end_subject
&& (md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
2128 reset_could_continue
= TRUE
;
2129 ADD_NEW_DATA(-(state_offset
+ 1), 0, ncount
);
2134 /*-----------------------------------------------------------------*/
2135 /* This is a tricky one like EXTUNI because it too can match more than one
2136 character (when CR is followed by LF). In this case, set up a negative
2137 state to wait for one character to pass before continuing. */
2140 if (clen
> 0) switch(c
)
2147 if ((md
->moptions
& PCRE_BSR_ANYCRLF
) != 0) break;
2150 ADD_NEW(state_offset
+ 1, 0);
2154 if (ptr
+ 1 >= end_subject
)
2156 ADD_NEW(state_offset
+ 1, 0);
2157 if ((md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
2158 reset_could_continue
= TRUE
;
2160 else if (ptr
[1] == 0x0a)
2162 ADD_NEW_DATA(-(state_offset
+ 1), 0, 1);
2166 ADD_NEW(state_offset
+ 1, 0);
2172 /*-----------------------------------------------------------------*/
2174 if (clen
> 0) switch(c
)
2186 ADD_NEW(state_offset
+ 1, 0);
2191 /*-----------------------------------------------------------------*/
2193 if (clen
> 0) switch(c
)
2202 ADD_NEW(state_offset
+ 1, 0);
2209 /*-----------------------------------------------------------------*/
2211 if (clen
> 0) switch(c
)
2214 case 0x20: /* SPACE */
2215 case 0xa0: /* NBSP */
2216 case 0x1680: /* OGHAM SPACE MARK */
2217 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2218 case 0x2000: /* EN QUAD */
2219 case 0x2001: /* EM QUAD */
2220 case 0x2002: /* EN SPACE */
2221 case 0x2003: /* EM SPACE */
2222 case 0x2004: /* THREE-PER-EM SPACE */
2223 case 0x2005: /* FOUR-PER-EM SPACE */
2224 case 0x2006: /* SIX-PER-EM SPACE */
2225 case 0x2007: /* FIGURE SPACE */
2226 case 0x2008: /* PUNCTUATION SPACE */
2227 case 0x2009: /* THIN SPACE */
2228 case 0x200A: /* HAIR SPACE */
2229 case 0x202f: /* NARROW NO-BREAK SPACE */
2230 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2231 case 0x3000: /* IDEOGRAPHIC SPACE */
2235 ADD_NEW(state_offset
+ 1, 0);
2240 /*-----------------------------------------------------------------*/
2242 if (clen
> 0) switch(c
)
2245 case 0x20: /* SPACE */
2246 case 0xa0: /* NBSP */
2247 case 0x1680: /* OGHAM SPACE MARK */
2248 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2249 case 0x2000: /* EN QUAD */
2250 case 0x2001: /* EM QUAD */
2251 case 0x2002: /* EN SPACE */
2252 case 0x2003: /* EM SPACE */
2253 case 0x2004: /* THREE-PER-EM SPACE */
2254 case 0x2005: /* FOUR-PER-EM SPACE */
2255 case 0x2006: /* SIX-PER-EM SPACE */
2256 case 0x2007: /* FIGURE SPACE */
2257 case 0x2008: /* PUNCTUATION SPACE */
2258 case 0x2009: /* THIN SPACE */
2259 case 0x200A: /* HAIR SPACE */
2260 case 0x202f: /* NARROW NO-BREAK SPACE */
2261 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2262 case 0x3000: /* IDEOGRAPHIC SPACE */
2263 ADD_NEW(state_offset
+ 1, 0);
2268 /*-----------------------------------------------------------------*/
2269 /* Match a negated single character casefully. */
2272 if (clen
> 0 && c
!= d
) { ADD_NEW(state_offset
+ dlen
+ 1, 0); }
2275 /*-----------------------------------------------------------------*/
2276 /* Match a negated single character caselessly. */
2281 unsigned int otherd
;
2283 if (utf
&& d
>= 128)
2286 otherd
= UCD_OTHERCASE(d
);
2287 #endif /* SUPPORT_UCP */
2290 #endif /* SUPPORT_UTF */
2291 otherd
= TABLE_GET(d
, fcc
, d
);
2292 if (c
!= d
&& c
!= otherd
)
2293 { ADD_NEW(state_offset
+ dlen
+ 1, 0); }
2297 /*-----------------------------------------------------------------*/
2302 case OP_NOTMINPLUSI
:
2303 case OP_NOTPOSPLUSI
:
2305 codevalue
-= OP_STARI
- OP_STAR
;
2314 count
= current_state
->count
; /* Already matched */
2315 if (count
> 0) { ADD_ACTIVE(state_offset
+ dlen
+ 1, 0); }
2318 unsigned int otherd
= NOTACHAR
;
2322 if (utf
&& d
>= 128)
2325 otherd
= UCD_OTHERCASE(d
);
2326 #endif /* SUPPORT_UCP */
2329 #endif /* SUPPORT_UTF */
2330 otherd
= TABLE_GET(d
, fcc
, d
);
2332 if ((c
== d
|| c
== otherd
) == (codevalue
< OP_NOTSTAR
))
2335 (codevalue
== OP_POSPLUS
|| codevalue
== OP_NOTPOSPLUS
))
2337 active_count
--; /* Remove non-match possibility */
2338 next_active_state
--;
2341 ADD_NEW(state_offset
, count
);
2346 /*-----------------------------------------------------------------*/
2351 case OP_NOTMINQUERYI
:
2352 case OP_NOTPOSQUERYI
:
2354 codevalue
-= OP_STARI
- OP_STAR
;
2360 case OP_NOTMINQUERY
:
2361 case OP_NOTPOSQUERY
:
2362 ADD_ACTIVE(state_offset
+ dlen
+ 1, 0);
2365 unsigned int otherd
= NOTACHAR
;
2369 if (utf
&& d
>= 128)
2372 otherd
= UCD_OTHERCASE(d
);
2373 #endif /* SUPPORT_UCP */
2376 #endif /* SUPPORT_UTF */
2377 otherd
= TABLE_GET(d
, fcc
, d
);
2379 if ((c
== d
|| c
== otherd
) == (codevalue
< OP_NOTSTAR
))
2381 if (codevalue
== OP_POSQUERY
|| codevalue
== OP_NOTPOSQUERY
)
2383 active_count
--; /* Remove non-match possibility */
2384 next_active_state
--;
2386 ADD_NEW(state_offset
+ dlen
+ 1, 0);
2391 /*-----------------------------------------------------------------*/
2396 case OP_NOTMINSTARI
:
2397 case OP_NOTPOSSTARI
:
2399 codevalue
-= OP_STARI
- OP_STAR
;
2407 ADD_ACTIVE(state_offset
+ dlen
+ 1, 0);
2410 unsigned int otherd
= NOTACHAR
;
2414 if (utf
&& d
>= 128)
2417 otherd
= UCD_OTHERCASE(d
);
2418 #endif /* SUPPORT_UCP */
2421 #endif /* SUPPORT_UTF */
2422 otherd
= TABLE_GET(d
, fcc
, d
);
2424 if ((c
== d
|| c
== otherd
) == (codevalue
< OP_NOTSTAR
))
2426 if (codevalue
== OP_POSSTAR
|| codevalue
== OP_NOTPOSSTAR
)
2428 active_count
--; /* Remove non-match possibility */
2429 next_active_state
--;
2431 ADD_NEW(state_offset
, 0);
2436 /*-----------------------------------------------------------------*/
2440 codevalue
-= OP_STARI
- OP_STAR
;
2444 count
= current_state
->count
; /* Number already matched */
2447 unsigned int otherd
= NOTACHAR
;
2451 if (utf
&& d
>= 128)
2454 otherd
= UCD_OTHERCASE(d
);
2455 #endif /* SUPPORT_UCP */
2458 #endif /* SUPPORT_UTF */
2459 otherd
= TABLE_GET(d
, fcc
, d
);
2461 if ((c
== d
|| c
== otherd
) == (codevalue
< OP_NOTSTAR
))
2463 if (++count
>= GET2(code
, 1))
2464 { ADD_NEW(state_offset
+ dlen
+ 1 + IMM2_SIZE
, 0); }
2466 { ADD_NEW(state_offset
, count
); }
2471 /*-----------------------------------------------------------------*/
2476 case OP_NOTMINUPTOI
:
2477 case OP_NOTPOSUPTOI
:
2479 codevalue
-= OP_STARI
- OP_STAR
;
2487 ADD_ACTIVE(state_offset
+ dlen
+ 1 + IMM2_SIZE
, 0);
2488 count
= current_state
->count
; /* Number already matched */
2491 unsigned int otherd
= NOTACHAR
;
2495 if (utf
&& d
>= 128)
2498 otherd
= UCD_OTHERCASE(d
);
2499 #endif /* SUPPORT_UCP */
2502 #endif /* SUPPORT_UTF */
2503 otherd
= TABLE_GET(d
, fcc
, d
);
2505 if ((c
== d
|| c
== otherd
) == (codevalue
< OP_NOTSTAR
))
2507 if (codevalue
== OP_POSUPTO
|| codevalue
== OP_NOTPOSUPTO
)
2509 active_count
--; /* Remove non-match possibility */
2510 next_active_state
--;
2512 if (++count
>= GET2(code
, 1))
2513 { ADD_NEW(state_offset
+ dlen
+ 1 + IMM2_SIZE
, 0); }
2515 { ADD_NEW(state_offset
, count
); }
2521 /* ========================================================================== */
2522 /* These are the class-handling opcodes */
2528 BOOL isinclass
= FALSE
;
2529 int next_state_offset
;
2530 const pcre_uchar
*ecode
;
2532 /* For a simple class, there is always just a 32-byte table, and we
2533 can set isinclass from it. */
2535 if (codevalue
!= OP_XCLASS
)
2537 ecode
= code
+ 1 + (32 / sizeof(pcre_uchar
));
2540 isinclass
= (c
> 255)? (codevalue
== OP_NCLASS
) :
2541 ((((pcre_uint8
*)(code
+ 1))[c
/8] & (1 << (c
&7))) != 0);
2545 /* An extended class may have a table or a list of single characters,
2546 ranges, or both, and it may be positive or negative. There's a
2547 function that sorts all this out. */
2551 ecode
= code
+ GET(code
, 1);
2552 if (clen
> 0) isinclass
= PRIV(xclass
)(c
, code
+ 1 + LINK_SIZE
, utf
);
2555 /* At this point, isinclass is set for all kinds of class, and ecode
2556 points to the byte after the end of the class. If there is a
2557 quantifier, this is where it will be. */
2559 next_state_offset
= (int)(ecode
- start_code
);
2565 ADD_ACTIVE(next_state_offset
+ 1, 0);
2566 if (isinclass
) { ADD_NEW(state_offset
, 0); }
2571 count
= current_state
->count
; /* Already matched */
2572 if (count
> 0) { ADD_ACTIVE(next_state_offset
+ 1, 0); }
2573 if (isinclass
) { count
++; ADD_NEW(state_offset
, count
); }
2578 ADD_ACTIVE(next_state_offset
+ 1, 0);
2579 if (isinclass
) { ADD_NEW(next_state_offset
+ 1, 0); }
2584 count
= current_state
->count
; /* Already matched */
2585 if (count
>= GET2(ecode
, 1))
2586 { ADD_ACTIVE(next_state_offset
+ 1 + 2 * IMM2_SIZE
, 0); }
2589 int max
= GET2(ecode
, 1 + IMM2_SIZE
);
2590 if (++count
>= max
&& max
!= 0) /* Max 0 => no limit */
2591 { ADD_NEW(next_state_offset
+ 1 + 2 * IMM2_SIZE
, 0); }
2593 { ADD_NEW(state_offset
, count
); }
2598 if (isinclass
) { ADD_NEW(next_state_offset
, 0); }
2604 /* ========================================================================== */
2605 /* These are the opcodes for fancy brackets of various kinds. We have
2606 to use recursion in order to handle them. The "always failing" assertion
2607 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2608 though the other "backtracking verbs" are not supported. */
2611 forced_fail
++; /* Count FAILs for multiple states */
2617 case OP_ASSERTBACK_NOT
:
2620 int local_offsets
[2];
2621 int local_workspace
[1000];
2622 const pcre_uchar
*endasscode
= code
+ GET(code
, 1);
2624 while (*endasscode
== OP_ALT
) endasscode
+= GET(endasscode
, 1);
2626 rc
= internal_dfa_exec(
2627 md
, /* static match data */
2628 code
, /* this subexpression's code */
2629 ptr
, /* where we currently are */
2630 (int)(ptr
- start_subject
), /* start offset */
2631 local_offsets
, /* offset vector */
2632 sizeof(local_offsets
)/sizeof(int), /* size of same */
2633 local_workspace
, /* workspace vector */
2634 sizeof(local_workspace
)/sizeof(int), /* size of same */
2635 rlevel
); /* function recursion level */
2637 if (rc
== PCRE_ERROR_DFA_UITEM
) return rc
;
2638 if ((rc
>= 0) == (codevalue
== OP_ASSERT
|| codevalue
== OP_ASSERTBACK
))
2639 { ADD_ACTIVE((int)(endasscode
+ LINK_SIZE
+ 1 - start_code
), 0); }
2643 /*-----------------------------------------------------------------*/
2647 int local_offsets
[1000];
2648 int local_workspace
[1000];
2649 int codelink
= GET(code
, 1);
2652 /* Because of the way auto-callout works during compile, a callout item
2653 is inserted between OP_COND and an assertion condition. This does not
2654 happen for the other conditions. */
2656 if (code
[LINK_SIZE
+1] == OP_CALLOUT
)
2659 if (PUBL(callout
) != NULL
)
2661 PUBL(callout_block
) cb
;
2662 cb
.version
= 1; /* Version 1 of the callout block */
2663 cb
.callout_number
= code
[LINK_SIZE
+2];
2664 cb
.offset_vector
= offsets
;
2665 #ifdef COMPILE_PCRE8
2666 cb
.subject
= (PCRE_SPTR
)start_subject
;
2668 cb
.subject
= (PCRE_SPTR16
)start_subject
;
2670 cb
.subject_length
= (int)(end_subject
- start_subject
);
2671 cb
.start_match
= (int)(current_subject
- start_subject
);
2672 cb
.current_position
= (int)(ptr
- start_subject
);
2673 cb
.pattern_position
= GET(code
, LINK_SIZE
+ 3);
2674 cb
.next_item_length
= GET(code
, 3 + 2*LINK_SIZE
);
2676 cb
.capture_last
= -1;
2677 cb
.callout_data
= md
->callout_data
;
2678 cb
.mark
= NULL
; /* No (*MARK) support */
2679 if ((rrc
= (*PUBL(callout
))(&cb
)) < 0) return rrc
; /* Abandon */
2681 if (rrc
> 0) break; /* Fail this thread */
2682 code
+= PRIV(OP_lengths
)[OP_CALLOUT
]; /* Skip callout data */
2685 condcode
= code
[LINK_SIZE
+1];
2687 /* Back reference conditions are not supported */
2689 if (condcode
== OP_CREF
|| condcode
== OP_NCREF
)
2690 return PCRE_ERROR_DFA_UCOND
;
2692 /* The DEFINE condition is always false */
2694 if (condcode
== OP_DEF
)
2695 { ADD_ACTIVE(state_offset
+ codelink
+ LINK_SIZE
+ 1, 0); }
2697 /* The only supported version of OP_RREF is for the value RREF_ANY,
2698 which means "test if in any recursion". We can't test for specifically
recursed groups, so any other value gives PCRE_ERROR_DFA_UCOND. */
2701 else if (condcode
== OP_RREF
|| condcode
== OP_NRREF
)
2703 int value
= GET2(code
, LINK_SIZE
+ 2);
2704 if (value
!= RREF_ANY
) return PCRE_ERROR_DFA_UCOND
;
2705 if (md
->recursive
!= NULL
)
2706 { ADD_ACTIVE(state_offset
+ LINK_SIZE
+ 2 + IMM2_SIZE
, 0); }
2707 else { ADD_ACTIVE(state_offset
+ codelink
+ LINK_SIZE
+ 1, 0); }
2710 /* Otherwise, the condition is an assertion */
2715 const pcre_uchar
*asscode
= code
+ LINK_SIZE
+ 1;
2716 const pcre_uchar
*endasscode
= asscode
+ GET(asscode
, 1);
2718 while (*endasscode
== OP_ALT
) endasscode
+= GET(endasscode
, 1);
2720 rc
= internal_dfa_exec(
2721 md
, /* fixed match data */
2722 asscode
, /* this subexpression's code */
2723 ptr
, /* where we currently are */
2724 (int)(ptr
- start_subject
), /* start offset */
2725 local_offsets
, /* offset vector */
2726 sizeof(local_offsets
)/sizeof(int), /* size of same */
2727 local_workspace
, /* workspace vector */
2728 sizeof(local_workspace
)/sizeof(int), /* size of same */
2729 rlevel
); /* function recursion level */
2731 if (rc
== PCRE_ERROR_DFA_UITEM
) return rc
;
2733 (condcode
== OP_ASSERT
|| condcode
== OP_ASSERTBACK
))
2734 { ADD_ACTIVE((int)(endasscode
+ LINK_SIZE
+ 1 - start_code
), 0); }
2736 { ADD_ACTIVE(state_offset
+ codelink
+ LINK_SIZE
+ 1, 0); }
2741 /*-----------------------------------------------------------------*/
2744 dfa_recursion_info
*ri
;
2745 int local_offsets
[1000];
2746 int local_workspace
[1000];
2747 const pcre_uchar
*callpat
= start_code
+ GET(code
, 1);
2748 int recno
= (callpat
== md
->start_code
)? 0 :
2749 GET2(callpat
, 1 + LINK_SIZE
);
2752 DPRINTF(("%.*sStarting regex recursion\n", rlevel
*2-2, SP
));
2754 /* Check for repeating a recursion without advancing the subject
2755 pointer. This should catch convoluted mutual recursions. (Some simple
2756 cases are caught at compile time.) */
2758 for (ri
= md
->recursive
; ri
!= NULL
; ri
= ri
->prevrec
)
2759 if (recno
== ri
->group_num
&& ptr
== ri
->subject_position
)
2760 return PCRE_ERROR_RECURSELOOP
;
2762 /* Remember this recursion and where we started it so as to
2763 catch infinite loops. */
2765 new_recursive
.group_num
= recno
;
2766 new_recursive
.subject_position
= ptr
;
2767 new_recursive
.prevrec
= md
->recursive
;
2768 md
->recursive
= &new_recursive
;
2770 rc
= internal_dfa_exec(
2771 md
, /* fixed match data */
2772 callpat
, /* this subexpression's code */
2773 ptr
, /* where we currently are */
2774 (int)(ptr
- start_subject
), /* start offset */
2775 local_offsets
, /* offset vector */
2776 sizeof(local_offsets
)/sizeof(int), /* size of same */
2777 local_workspace
, /* workspace vector */
2778 sizeof(local_workspace
)/sizeof(int), /* size of same */
2779 rlevel
); /* function recursion level */
2781 md
->recursive
= new_recursive
.prevrec
; /* Done this recursion */
2783 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel
*2-2, SP
,
2786 /* Ran out of internal offsets */
2788 if (rc
== 0) return PCRE_ERROR_DFA_RECURSE
;
2790 /* For each successful matched substring, set up the next state with a
2791 count of characters to skip before trying it. Note that the count is in
2792 characters, not bytes. */
2796 for (rc
= rc
*2 - 2; rc
>= 0; rc
-= 2)
2798 int charcount
= local_offsets
[rc
+1] - local_offsets
[rc
];
2802 const pcre_uchar
*p
= start_subject
+ local_offsets
[rc
];
2803 const pcre_uchar
*pp
= start_subject
+ local_offsets
[rc
+1];
2804 while (p
< pp
) if (NOT_FIRSTCHAR(*p
++)) charcount
--;
2809 ADD_NEW_DATA(-(state_offset
+ LINK_SIZE
+ 1), 0, (charcount
- 1));
2813 ADD_ACTIVE(state_offset
+ LINK_SIZE
+ 1, 0);
2817 else if (rc
!= PCRE_ERROR_NOMATCH
) return rc
;
2821 /*-----------------------------------------------------------------*/
2828 int charcount
, matched_count
;
2829 const pcre_uchar
*local_ptr
= ptr
;
2832 if (codevalue
== OP_BRAPOSZERO
)
2835 codevalue
= *(++code
); /* Codevalue will be one of above BRAs */
2837 else allow_zero
= FALSE
;
2839 /* Loop to match the subpattern as many times as possible as if it were
2840 a complete pattern. */
2842 for (matched_count
= 0;; matched_count
++)
2844 int local_offsets
[2];
2845 int local_workspace
[1000];
2847 int rc
= internal_dfa_exec(
2848 md
, /* fixed match data */
2849 code
, /* this subexpression's code */
2850 local_ptr
, /* where we currently are */
2851 (int)(ptr
- start_subject
), /* start offset */
2852 local_offsets
, /* offset vector */
2853 sizeof(local_offsets
)/sizeof(int), /* size of same */
2854 local_workspace
, /* workspace vector */
2855 sizeof(local_workspace
)/sizeof(int), /* size of same */
2856 rlevel
); /* function recursion level */
2858 /* Failed to match */
2862 if (rc
!= PCRE_ERROR_NOMATCH
) return rc
;
2866 /* Matched: break the loop if zero characters matched. */
2868 charcount
= local_offsets
[1] - local_offsets
[0];
2869 if (charcount
== 0) break;
2870 local_ptr
+= charcount
; /* Advance temporary position ptr */
2873 /* At this point we have matched the subpattern matched_count
2874 times, and local_ptr is pointing to the character after the end of the
last match. */
2877 if (matched_count
> 0 || allow_zero
)
2879 const pcre_uchar
*end_subpattern
= code
;
2880 int next_state_offset
;
2882 do { end_subpattern
+= GET(end_subpattern
, 1); }
2883 while (*end_subpattern
== OP_ALT
);
2885 (int)(end_subpattern
- start_code
+ LINK_SIZE
+ 1);
2887 /* Optimization: if there are no more active states, and there
2888 are no new states yet set up, then skip over the subject string
2889 right here, to save looping. Otherwise, set up the new state to swing
2890 into action when the end of the matched substring is reached. */
2892 if (i
+ 1 >= active_count
&& new_count
== 0)
2896 ADD_NEW(next_state_offset
, 0);
2900 const pcre_uchar
*p
= ptr
;
2901 const pcre_uchar
*pp
= local_ptr
;
2902 charcount
= (int)(pp
- p
);
2904 if (utf
) while (p
< pp
) if (NOT_FIRSTCHAR(*p
++)) charcount
--;
2906 ADD_NEW_DATA(-next_state_offset
, 0, (charcount
- 1));
2912 /*-----------------------------------------------------------------*/
2916 int local_offsets
[2];
2917 int local_workspace
[1000];
2919 int rc
= internal_dfa_exec(
2920 md
, /* fixed match data */
2921 code
, /* this subexpression's code */
2922 ptr
, /* where we currently are */
2923 (int)(ptr
- start_subject
), /* start offset */
2924 local_offsets
, /* offset vector */
2925 sizeof(local_offsets
)/sizeof(int), /* size of same */
2926 local_workspace
, /* workspace vector */
2927 sizeof(local_workspace
)/sizeof(int), /* size of same */
2928 rlevel
); /* function recursion level */
2932 const pcre_uchar
*end_subpattern
= code
;
2933 int charcount
= local_offsets
[1] - local_offsets
[0];
2934 int next_state_offset
, repeat_state_offset
;
2936 do { end_subpattern
+= GET(end_subpattern
, 1); }
2937 while (*end_subpattern
== OP_ALT
);
2939 (int)(end_subpattern
- start_code
+ LINK_SIZE
+ 1);
2941 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2942 arrange for the repeat state also to be added to the relevant list.
2943 Calculate the offset, or set -1 for no repeat. */
2945 repeat_state_offset
= (*end_subpattern
== OP_KETRMAX
||
2946 *end_subpattern
== OP_KETRMIN
)?
2947 (int)(end_subpattern
- start_code
- GET(end_subpattern
, 1)) : -1;
2949 /* If we have matched an empty string, add the next state at the
2950 current character pointer. This is important so that the duplicate
2951 checking kicks in, which is what breaks infinite loops that match an
empty string. */
2956 ADD_ACTIVE(next_state_offset
, 0);
2959 /* Optimization: if there are no more active states, and there
2960 are no new states yet set up, then skip over the subject string
2961 right here, to save looping. Otherwise, set up the new state to swing
2962 into action when the end of the matched substring is reached. */
2964 else if (i
+ 1 >= active_count
&& new_count
== 0)
2968 ADD_NEW(next_state_offset
, 0);
2970 /* If we are adding a repeat state at the new character position,
2971 we must fudge things so that it is the only current state.
2972 Otherwise, it might be a duplicate of one we processed before, and
2973 that would cause it to be skipped. */
2975 if (repeat_state_offset
>= 0)
2977 next_active_state
= active_states
;
2980 ADD_ACTIVE(repeat_state_offset
, 0);
2988 const pcre_uchar
*p
= start_subject
+ local_offsets
[0];
2989 const pcre_uchar
*pp
= start_subject
+ local_offsets
[1];
2990 while (p
< pp
) if (NOT_FIRSTCHAR(*p
++)) charcount
--;
2993 ADD_NEW_DATA(-next_state_offset
, 0, (charcount
- 1));
2994 if (repeat_state_offset
>= 0)
2995 { ADD_NEW_DATA(-repeat_state_offset
, 0, (charcount
- 1)); }
2998 else if (rc
!= PCRE_ERROR_NOMATCH
) return rc
;
3003 /* ========================================================================== */
3004 /* Handle callouts */
3008 if (PUBL(callout
) != NULL
)
3010 PUBL(callout_block
) cb
;
3011 cb
.version
= 1; /* Version 1 of the callout block */
3012 cb
.callout_number
= code
[1];
3013 cb
.offset_vector
= offsets
;
3014 #ifdef COMPILE_PCRE8
3015 cb
.subject
= (PCRE_SPTR
)start_subject
;
3017 cb
.subject
= (PCRE_SPTR16
)start_subject
;
3019 cb
.subject_length
= (int)(end_subject
- start_subject
);
3020 cb
.start_match
= (int)(current_subject
- start_subject
);
3021 cb
.current_position
= (int)(ptr
- start_subject
);
3022 cb
.pattern_position
= GET(code
, 2);
3023 cb
.next_item_length
= GET(code
, 2 + LINK_SIZE
);
3025 cb
.capture_last
= -1;
3026 cb
.callout_data
= md
->callout_data
;
3027 cb
.mark
= NULL
; /* No (*MARK) support */
3028 if ((rrc
= (*PUBL(callout
))(&cb
)) < 0) return rrc
; /* Abandon */
3031 { ADD_ACTIVE(state_offset
+ PRIV(OP_lengths
)[OP_CALLOUT
], 0); }
3035 /* ========================================================================== */
3036 default: /* Unsupported opcode */
3037 return PCRE_ERROR_DFA_UITEM
;
3040 NEXT_ACTIVE_STATE
: continue;
3042 } /* End of loop scanning active states */
3044 /* We have finished the processing at the current subject character. If no
3045 new states have been set for the next character, we have found all the
3046 matches that we are going to find. If we are at the top level and partial
3047 matching has been requested, check for appropriate conditions.
3049 The "forced_ fail" variable counts the number of (*F) encountered for the
3050 character. If it is equal to the original active_count (saved in
3051 workspace[1]) it means that (*F) was found on every active state. In this
3052 case we don't want to give a partial match.
3054 The "could_continue" variable is true if a state could have continued but
3055 for the fact that the end of the subject was reached. */
3059 if (rlevel
== 1 && /* Top level, and */
3060 could_continue
&& /* Some could go on, and */
3061 forced_fail
!= workspace
[1] && /* Not all forced fail & */
3063 (md
->moptions
& PCRE_PARTIAL_HARD
) != 0 /* Hard partial */
3065 ((md
->moptions
& PCRE_PARTIAL_SOFT
) != 0 && /* Soft partial and */
3066 match_count
< 0) /* no matches */
3069 partial_newline
|| /* Either partial NL */
3071 ptr
>= end_subject
&& /* End of subject and */
3072 ptr
> md
->start_used_ptr
) /* Inspected non-empty string */
3076 if (offsetcount
>= 2)
3078 offsets
[0] = (int)(md
->start_used_ptr
- start_subject
);
3079 offsets
[1] = (int)(end_subject
- start_subject
);
3081 match_count
= PCRE_ERROR_PARTIAL
;
3084 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3085 "%.*s---------------------\n\n", rlevel
*2-2, SP
, rlevel
, match_count
,
3087 break; /* In effect, "return", but see the comment below */
3090 /* One or more states are active for the next character. */
3092 ptr
+= clen
; /* Advance to next subject character */
3093 } /* Loop to move along the subject string */
3095 /* Control gets here from "break" a few lines above. We do it this way because
3096 if we use "return" above, we have compiler trouble. Some compilers warn if
3097 there's nothing here because they think the function doesn't return a value. On
3098 the other hand, if we put a dummy statement here, some more clever compilers
3099 complain that it can't be reached. Sigh. */
3107 /*************************************************
3108 * Execute a Regular Expression - DFA engine *
3109 *************************************************/
3111 /* This external function applies a compiled re to a subject string using a DFA
3112 engine. This function calls the internal function multiple times if the pattern
is not anchored.

Arguments:
3116 argument_re points to the compiled expression
3117 extra_data points to extra data or is NULL
3118 subject points to the subject string
3119 length length of subject string (may contain binary zeros)
3120 start_offset where to start in the subject string
options option bits
3122 offsets vector of match offsets
3123 offsetcount size of same
3124 workspace workspace vector
3125 wscount size of same

3127 Returns: > 0 => number of match offset pairs placed in offsets
3128 = 0 => offsets overflowed; longest matches are present
3129 -1 => failed to match
3130 < -1 => some kind of unexpected problem
*/
3133 #ifdef COMPILE_PCRE8
3134 PCRE_EXP_DEFN
int PCRE_CALL_CONVENTION
3135 pcre_dfa_exec(const pcre
*argument_re
, const pcre_extra
*extra_data
,
3136 const char *subject
, int length
, int start_offset
, int options
, int *offsets
,
3137 int offsetcount
, int *workspace
, int wscount
)
3139 PCRE_EXP_DEFN
int PCRE_CALL_CONVENTION
3140 pcre16_dfa_exec(const pcre16
*argument_re
, const pcre16_extra
*extra_data
,
3141 PCRE_SPTR16 subject
, int length
, int start_offset
, int options
, int *offsets
,
3142 int offsetcount
, int *workspace
, int wscount
)
3145 REAL_PCRE
*re
= (REAL_PCRE
*)argument_re
;
3146 dfa_match_data match_block
;
3147 dfa_match_data
*md
= &match_block
;
3148 BOOL utf
, anchored
, startline
, firstline
;
3149 const pcre_uchar
*current_subject
, *end_subject
;
3150 const pcre_study_data
*study
= NULL
;
3152 const pcre_uchar
*req_char_ptr
;
3153 const pcre_uint8
*start_bits
= NULL
;
3154 BOOL has_first_char
= FALSE
;
3155 BOOL has_req_char
= FALSE
;
3156 pcre_uchar first_char
= 0;
3157 pcre_uchar first_char2
= 0;
3158 pcre_uchar req_char
= 0;
3159 pcre_uchar req_char2
= 0;
/* Overview of what follows: validate the arguments and the compiled pattern,
fill in the dfa_match_data block (tables, callout data, subject bounds, options,
newline convention), pick up any first-character / required-character / study
hints, and then call internal_dfa_exec() at successive start positions until it
returns something other than PCRE_ERROR_NOMATCH or the match is anchored. */
3162 /* Plausibility checks */
3164 if ((options
& ~PUBLIC_DFA_EXEC_OPTIONS
) != 0) return PCRE_ERROR_BADOPTION
;
3165 if (re
== NULL
|| subject
== NULL
|| workspace
== NULL
||
3166 (offsets
== NULL
&& offsetcount
> 0)) return PCRE_ERROR_NULL
;
3167 if (offsetcount
< 0) return PCRE_ERROR_BADCOUNT
;
3168 if (wscount
< 20) return PCRE_ERROR_DFA_WSSIZE
;
3169 if (start_offset
< 0 || start_offset
> length
) return PCRE_ERROR_BADOFFSET
;
3171 /* Check that the first field in the block is the magic number. If it is not,
3172 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3173 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3174 means that the pattern is likely compiled with different endianness. */
3176 if (re
->magic_number
!= MAGIC_NUMBER
)
3177 return re
->magic_number
== REVERSED_MAGIC_NUMBER
?
3178 PCRE_ERROR_BADENDIANNESS
:PCRE_ERROR_BADMAGIC
;
/* A pattern whose flags have no PCRE_MODE bit set is rejected with
PCRE_ERROR_BADMODE (presumably it was compiled for a different code-unit
width -- confirm against the definition of PCRE_MODE). */
3179 if ((re
->flags
& PCRE_MODE
) == 0) return PCRE_ERROR_BADMODE
;
3181 /* If restarting after a partial match, do some sanity checks on the contents
3182 of the workspace: workspace[0] may have at most bit 0 set, and workspace[1]
must be at least 1 and no more than (wscount - 2)/INTS_PER_STATEBLOCK. */
3184 if ((options
& PCRE_DFA_RESTART
) != 0)
3186 if ((workspace
[0] & (-2)) != 0 || workspace
[1] < 1 ||
3187 workspace
[1] > (wscount
- 2)/INTS_PER_STATEBLOCK
)
3188 return PCRE_ERROR_DFA_BADRESTART
;
3191 /* Set up study, callout, and table data */
3193 md
->tables
= re
->tables
;
3194 md
->callout_data
= NULL
;
3196 if (extra_data
!= NULL
)
3198 unsigned int flags
= extra_data
->flags
;
3199 if ((flags
& PCRE_EXTRA_STUDY_DATA
) != 0)
3200 study
= (const pcre_study_data
*)extra_data
->study_data
;
/* Either match-limit flag in extra_data causes an immediate
PCRE_ERROR_DFA_UMLIMIT return. */
3201 if ((flags
& PCRE_EXTRA_MATCH_LIMIT
) != 0) return PCRE_ERROR_DFA_UMLIMIT
;
3202 if ((flags
& PCRE_EXTRA_MATCH_LIMIT_RECURSION
) != 0)
3203 return PCRE_ERROR_DFA_UMLIMIT
;
3204 if ((flags
& PCRE_EXTRA_CALLOUT_DATA
) != 0)
3205 md
->callout_data
= extra_data
->callout_data
;
3206 if ((flags
& PCRE_EXTRA_TABLES
) != 0)
3207 md
->tables
= extra_data
->tables
;
3210 /* Set some local values */
3212 current_subject
= (const pcre_uchar
*)subject
+ start_offset
;
3213 end_subject
= (const pcre_uchar
*)subject
+ length
;
/* req_char_ptr starts one position before the first candidate start so that
the first "p > req_char_ptr" test in the required-character search below
succeeds. */
3214 req_char_ptr
= current_subject
- 1;
3217 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3218 utf
= (re
->options
& PCRE_UTF8
) != 0;
/* The match is anchored if PCRE_ANCHORED or PCRE_DFA_RESTART is given in the
match-time options, or if PCRE_ANCHORED is set in the pattern's own options. */
3223 anchored
= (options
& (PCRE_ANCHORED
|PCRE_DFA_RESTART
)) != 0 ||
3224 (re
->options
& PCRE_ANCHORED
) != 0;
3226 /* The remaining fixed data for passing around. The compiled code is taken to
start immediately after the name table (name_count entries of name_entry_size
each, beginning at name_table_offset). */
3228 md
->start_code
= (const pcre_uchar
*)argument_re
+
3229 re
->name_table_offset
+ re
->name_count
* re
->name_entry_size
;
3230 md
->start_subject
= (const pcre_uchar
*)subject
;
3231 md
->end_subject
= end_subject
;
3232 md
->start_offset
= start_offset
;
3233 md
->moptions
= options
;
3234 md
->poptions
= re
->options
;
3236 /* If the BSR option is not set at match time, copy what was set
3239 if ((md
->moptions
& (PCRE_BSR_ANYCRLF
|PCRE_BSR_UNICODE
)) == 0)
3241 if ((re
->options
& (PCRE_BSR_ANYCRLF
|PCRE_BSR_UNICODE
)) != 0)
3242 md
->moptions
|= re
->options
& (PCRE_BSR_ANYCRLF
|PCRE_BSR_UNICODE
);
3244 else md
->moptions
|= PCRE_BSR_ANYCRLF
;
3248 /* Handle different types of newline. The three bits give eight cases. If
3249 nothing is set at run time, whatever was used at compile time applies. */
3251 switch ((((options
& PCRE_NEWLINE_BITS
) == 0)? re
->options
: (pcre_uint32
)options
) &
3254 case 0: newline
= NEWLINE
; break; /* Compile-time default */
3255 case PCRE_NEWLINE_CR
: newline
= CHAR_CR
; break;
3256 case PCRE_NEWLINE_LF
: newline
= CHAR_NL
; break;
3257 case PCRE_NEWLINE_CR
+
3258 PCRE_NEWLINE_LF
: newline
= (CHAR_CR
<< 8) | CHAR_NL
; break;
3259 case PCRE_NEWLINE_ANY
: newline
= -1; break;
3260 case PCRE_NEWLINE_ANYCRLF
: newline
= -2; break;
3261 default: return PCRE_ERROR_BADNEWLINE
;
/* Record the newline type in md->nltype. The switch above encodes ANY as -1,
ANYCRLF as -2, and CRLF as (CHAR_CR << 8) | CHAR_NL; for a fixed newline the
one or two characters are unpacked into md->nl[]. */
3266 md
->nltype
= NLTYPE_ANYCRLF
;
3268 else if (newline
< 0)
3270 md
->nltype
= NLTYPE_ANY
;
3274 md
->nltype
= NLTYPE_FIXED
;
3278 md
->nl
[0] = (newline
>> 8) & 255;
3279 md
->nl
[1] = newline
& 255;
3284 md
->nl
[0] = newline
;
3288 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3289 back the character offset. */
3292 if (utf
&& (options
& PCRE_NO_UTF8_CHECK
) == 0)
3295 int errorcode
= PRIV(valid_utf
)((pcre_uchar
*)subject
, length
, &erroroffset
);
/* When the caller supplied at least two offset slots, the error offset and the
detailed error code are handed back in offsets[0] and offsets[1]. */
3298 if (offsetcount
>= 2)
3300 offsets
[0] = erroroffset
;
3301 offsets
[1] = errorcode
;
3303 return (errorcode
<= PCRE_UTF8_ERR5
&& (options
& PCRE_PARTIAL_HARD
) != 0)?
3304 PCRE_ERROR_SHORTUTF8
: PCRE_ERROR_BADUTF8
;
/* A start offset strictly inside the subject must not land on a unit for which
NOT_FIRSTCHAR() is true (presumably a non-leading unit of a multi-unit
character -- confirm against the macro definition). */
3306 if (start_offset
> 0 && start_offset
< length
&&
3307 NOT_FIRSTCHAR(((PCRE_PUCHAR
)subject
)[start_offset
]))
3308 return PCRE_ERROR_BADUTF8_OFFSET
;
3312 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3313 is a feature that makes it possible to save compiled regex and re-use them
3314 in other programs later. */
3316 if (md
->tables
== NULL
) md
->tables
= PRIV(default_tables
);
3318 /* The "must be at the start of a line" flags are used in a loop when finding
3321 startline
= (re
->flags
& PCRE_STARTLINE
) != 0;
3322 firstline
= (re
->options
& PCRE_FIRSTLINE
) != 0;
3324 /* Set up the first character to match, if available. The first_byte value is
3325 never set for an anchored regular expression, but the anchoring may be forced
3326 at run time, so we have to test for anchoring. The first char may be unset for
3327 an unanchored pattern, of course. If there's no first char and the pattern was
3328 studied, there may be a bitmap of possible first characters. */
3332 if ((re
->flags
& PCRE_FIRSTSET
) != 0)
3334 has_first_char
= TRUE
;
3335 first_char
= first_char2
= (pcre_uchar
)(re
->first_char
);
3336 if ((re
->flags
& PCRE_FCH_CASELESS
) != 0)
3338 first_char2
= TABLE_GET(first_char
, md
->tables
+ fcc_offset
, first_char
);
3339 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3340 if (utf
&& first_char
> 127)
3341 first_char2
= UCD_OTHERCASE(first_char
);
/* The studied start-bits map is picked up only when the pattern is not
start-of-line constrained and the study data has PCRE_STUDY_MAPPED set. */
3347 if (!startline
&& study
!= NULL
&&
3348 (study
->flags
& PCRE_STUDY_MAPPED
) != 0)
3349 start_bits
= study
->start_bits
;
3353 /* For anchored or unanchored matches, there may be a "last known required
3356 if ((re
->flags
& PCRE_REQCHSET
) != 0)
3358 has_req_char
= TRUE
;
3359 req_char
= req_char2
= (pcre_uchar
)(re
->req_char
);
3360 if ((re
->flags
& PCRE_RCH_CASELESS
) != 0)
3362 req_char2
= TABLE_GET(req_char
, md
->tables
+ fcc_offset
, req_char
);
3363 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3364 if (utf
&& req_char
> 127)
3365 req_char2
= UCD_OTHERCASE(req_char
);
3370 /* Call the main matching function, looping for a non-anchored regex after a
3371 failed match. If not restarting, perform certain optimizations at the start of
3378 if ((options
& PCRE_DFA_RESTART
) == 0)
3380 const pcre_uchar
*save_end_subject
= end_subject
;
3382 /* If firstline is TRUE, the start of the match is constrained to the first
3383 line of a multiline string. Implement this by temporarily adjusting
3384 end_subject so that we stop scanning at a newline. If the match fails at
3385 the newline, later code breaks this loop. */
3389 PCRE_PUCHAR t
= current_subject
;
3393 while (t
< md
->end_subject
&& !IS_NEWLINE(t
))
3396 ACROSSCHAR(t
< end_subject
, *t
, t
++);
3401 while (t
< md
->end_subject
&& !IS_NEWLINE(t
)) t
++;
3405 /* There are some optimizations that avoid running the match if a known
3406 starting point is not found. However, there is an option that disables
3407 these, for testing and for ensuring that all callouts do actually occur.
3408 The option can be set in the regex by (*NO_START_OPT) or passed in
3409 match-time options. */
3411 if (((options
| re
->options
) & PCRE_NO_START_OPTIMIZE
) == 0)
3413 /* Advance to a known first char. */
3417 if (first_char
!= first_char2
)
3418 while (current_subject
< end_subject
&&
3419 *current_subject
!= first_char
&& *current_subject
!= first_char2
)
3422 while (current_subject
< end_subject
&&
3423 *current_subject
!= first_char
)
3427 /* Or to just after a linebreak for a multiline match if possible */
3431 if (current_subject
> md
->start_subject
+ start_offset
)
3436 while (current_subject
< end_subject
&&
3437 !WAS_NEWLINE(current_subject
))
3440 ACROSSCHAR(current_subject
< end_subject
, *current_subject
,
3446 while (current_subject
< end_subject
&& !WAS_NEWLINE(current_subject
))
3449 /* If we have just passed a CR and the newline option is ANY or
3450 ANYCRLF, and we are now at a LF, advance the match position by one
3453 if (current_subject
[-1] == CHAR_CR
&&
3454 (md
->nltype
== NLTYPE_ANY
|| md
->nltype
== NLTYPE_ANYCRLF
) &&
3455 current_subject
< end_subject
&&
3456 *current_subject
== CHAR_NL
)
3461 /* Or to a non-unique first char after study */
3463 else if (start_bits
!= NULL
)
3465 while (current_subject
< end_subject
)
3467 unsigned int c
= *current_subject
;
3468 #ifndef COMPILE_PCRE8
3469 if (c
> 255) c
= 255;
3471 if ((start_bits
[c
/8] & (1 << (c
&7))) == 0)
3474 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3475 /* In non 8-bit mode, the iteration will stop for
3476 characters > 255 at the beginning or not stop at all. */
3478 ACROSSCHAR(current_subject
< end_subject
, *current_subject
,
3487 /* Restore fudged end_subject */
3489 end_subject
= save_end_subject
;
3491 /* The following two optimizations are disabled for partial matching or if
3492 disabling is explicitly requested (and of course, by the test above, this
3493 code is not obeyed when restarting after a partial match). */
3495 if (((options
| re
->options
) & PCRE_NO_START_OPTIMIZE
) == 0 &&
3496 (options
& (PCRE_PARTIAL_HARD
|PCRE_PARTIAL_SOFT
)) == 0)
3498 /* If the pattern was studied, a minimum subject length may be set. This
3499 is a lower bound; no actual string of that length may actually match the
3500 pattern. Although the value is, strictly, in characters, we treat it as
3501 bytes to avoid spending too much time in this optimization. */
3503 if (study
!= NULL
&& (study
->flags
& PCRE_STUDY_MINLEN
) != 0 &&
3504 (pcre_uint32
)(end_subject
- current_subject
) < study
->minlength
)
3505 return PCRE_ERROR_NOMATCH
;
3507 /* If req_char is set, we know that that character must appear in the
3508 subject for the match to succeed. If the first character is set, req_char
3509 must be later in the subject; otherwise the test starts at the match
3510 point. This optimization can save a huge amount of work in patterns with
3511 nested unlimited repeats that aren't going to match. Writing separate
3512 code for cased/caseless versions makes it go faster, as does using an
3513 autoincrement and backing off on a match.
3515 HOWEVER: when the subject string is very, very long, searching to its end
3516 can take a long time, and give bad performance on quite ordinary
3517 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3518 string... so we don't do this when the string is sufficiently long. */
3520 if (has_req_char
&& end_subject
- current_subject
< REQ_BYTE_MAX
)
3522 PCRE_PUCHAR p
= current_subject
+ (has_first_char
? 1:0);
3524 /* We don't need to repeat the search if we haven't yet reached the
3525 place we found it at last time. */
3527 if (p
> req_char_ptr
)
3529 if (req_char
!= req_char2
)
3531 while (p
< end_subject
)
3534 if (pp
== req_char
|| pp
== req_char2
) { p
--; break; }
3539 while (p
< end_subject
)
3541 if (*p
++ == req_char
) { p
--; break; }
3545 /* If we can't find the required character, break the matching loop,
3546 which will cause a return of PCRE_ERROR_NOMATCH. */
3548 if (p
>= end_subject
) break;
3550 /* If we have found the required character, save the point where we
3551 found it, so that we don't search again next time round the loop if
3552 the start hasn't passed this character yet. */
3558 } /* End of optimizations that are done when not restarting */
3560 /* OK, now we can do the business */
3562 md
->start_used_ptr
= current_subject
;
3563 md
->recursive
= NULL
;
3565 rc
= internal_dfa_exec(
3566 md
, /* fixed match data */
3567 md
->start_code
, /* this subexpression's code */
3568 current_subject
, /* where we currently are */
3569 start_offset
, /* start offset in subject */
3570 offsets
, /* offset vector */
3571 offsetcount
, /* size of same */
3572 workspace
, /* workspace vector */
3573 wscount
, /* size of same */
3574 0); /* function recurse level */
3576 /* Anything other than "no match" means we are done, always; otherwise, carry
3577 on only if not anchored. */
3579 if (rc
!= PCRE_ERROR_NOMATCH
|| anchored
) return rc
;
3581 /* Advance to the next subject character unless we are at the end of a line
3582 and firstline is set. */
3584 if (firstline
&& IS_NEWLINE(current_subject
)) break;
3589 ACROSSCHAR(current_subject
< end_subject
, *current_subject
,
/* The test below is ">" rather than ">=": a start position exactly at
end_subject is still tried; the loop stops only once the position has moved
beyond it. */
3593 if (current_subject
> end_subject
) break;
3595 /* If we have just passed a CR and we are now at a LF, and the pattern does
3596 not contain any explicit matches for \r or \n, and the newline option is CRLF
3597 or ANY or ANYCRLF, advance the match position by one more character. */
3599 if (current_subject
[-1] == CHAR_CR
&&
3600 current_subject
< end_subject
&&
3601 *current_subject
== CHAR_NL
&&
3602 (re
->flags
& PCRE_HASCRORLF
) == 0 &&
3603 (md
->nltype
== NLTYPE_ANY
||
3604 md
->nltype
== NLTYPE_ANYCRLF
||
3608 } /* "Bumpalong" loop */
3610 return PCRE_ERROR_NOMATCH
;
3613 /* End of pcre_dfa_exec.c */