1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
9 Written by Philip Hazel
10 Copyright (c) 1997-2012 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain applications. */
47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48 the performance of his patterns greatly. I could not use it as it stood, as it
49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 test 7 to loop, and test 9 to crash with a segfault.
52 The issue is the check for duplicate states, which is done by a simple linear
53 search up the state list. (Grep for "duplicate" below to find the code.) For
54 many patterns, there will never be many states active at one time, so a simple
55 linear search is fine. In patterns that have many active states, it might be a
56 bottleneck. The suggested code used an indexing scheme to remember which states
57 had previously been used for each character, and avoided the linear search when
58 it knew there was no chance of a duplicate. This was implemented when adding
59 states to the state lists.
61 I wrote some thread-safe, not-limited code to try something similar at the time
62 of checking for duplicates (instead of when adding states), using index vectors
63 on the stack. It did give a 13% improvement with one specially constructed
64 pattern for certain subject strings, but on other strings and on many of the
65 simpler patterns in the test suite it did worse. The major problem, I think,
66 was the extra time to initialize the index. This had to be done for each call
67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68 only once - I suspect this was the cause of the problems with the tests.)
70 Overall, I concluded that the gains in some cases did not outweigh the losses
71 in others, so I abandoned this code. */
77 #define NLBLOCK md /* Block containing newline information */
78 #define PSSTART start_subject /* Field containing processed string start */
79 #define PSEND end_subject /* Field containing processed string end */
81 #include "pcre_internal.h"
84 /* For use to indent debugging output */
89 /*************************************************
90 * Code parameters and static tables *
91 *************************************************/
93 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
94 into others, under special conditions. A gap of 20 between the blocks should be
95 enough. The resulting opcodes don't have to be less than 256 because they are
96 never stored, so we push them well clear of the normal opcodes. */
/* Usage in this file: inside the "codevalue >= OP_TYPESTAR" test of the state
loop, the cases OP_PROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE and OP_VSPACE each add
the matching offset below to codevalue. NOTE(review): the switch expression
for those cases is not visible in this view - confirm what is switched on. */
98 #define OP_PROP_EXTRA 300
99 #define OP_EXTUNI_EXTRA 320
100 #define OP_ANYNL_EXTRA 340
101 #define OP_HSPACE_EXTRA 360
102 #define OP_VSPACE_EXTRA 380
105 /* This table identifies those opcodes that are followed immediately by a
106 character that is to be tested in some way. This makes it possible to
107 centralize the loading of these characters. In the case of Type * etc, the
108 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
109 small value. Non-zero values in the table are the offsets from the opcode where
110 the character is to be found. ***NOTE*** If the start of this table is
111 modified, the three tables that follow must also be modified. */
/* coptable is indexed by opcode value. The state loop tests
coptable[codevalue] > 0 and, when that holds, loads the inline item found at
code + coptable[codevalue]. Entries of 1+IMM2_SIZE appear only for the upto,
minupto, exact and possessive upto+ forms. NOTE(review): presumably those
opcodes store an IMM2_SIZE-wide repeat count ahead of the character - confirm
against the opcode layout in pcre_internal.h. */
113 static const pcre_uint8 coptable
[] = {
115 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
116 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
117 0, 0, 0, /* Any, AllAny, Anybyte */
119 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
121 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
126 /* Positive single-char repeats */
127 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
128 1+IMM2_SIZE
, 1+IMM2_SIZE
, /* upto, minupto */
129 1+IMM2_SIZE
, /* exact */
130 1, 1, 1, 1+IMM2_SIZE
, /* *+, ++, ?+, upto+ */
131 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
132 1+IMM2_SIZE
, 1+IMM2_SIZE
, /* upto I, minupto I */
133 1+IMM2_SIZE
, /* exact I */
134 1, 1, 1, 1+IMM2_SIZE
, /* *+I, ++I, ?+I, upto+I */
135 /* Negative single-char repeats - only for chars < 256 */
136 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
137 1+IMM2_SIZE
, 1+IMM2_SIZE
, /* NOT upto, minupto */
138 1+IMM2_SIZE
, /* NOT exact */
139 1, 1, 1, 1+IMM2_SIZE
, /* NOT *+, ++, ?+, upto+ */
140 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141 1+IMM2_SIZE
, 1+IMM2_SIZE
, /* NOT upto I, minupto I */
142 1+IMM2_SIZE
, /* NOT exact I */
143 1, 1, 1, 1+IMM2_SIZE
, /* NOT *+I, ++I, ?+I, upto+I */
144 /* Positive type repeats */
145 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
146 1+IMM2_SIZE
, 1+IMM2_SIZE
, /* Type upto, minupto */
147 1+IMM2_SIZE
, /* Type exact */
148 1, 1, 1, 1+IMM2_SIZE
, /* Type *+, ++, ?+, upto+ */
149 /* Character class & ref repeats */
150 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
151 0, 0, /* CRRANGE, CRMINRANGE */
154 0, /* XCLASS - variable length */
167 0, /* Assert behind */
168 0, /* Assert behind not */
169 0, 0, /* ONCE, ONCE_NC */
170 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
171 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
172 0, 0, /* CREF, NCREF */
173 0, 0, /* RREF, NRREF */
175 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
176 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
177 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
178 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
179 0, 0 /* CLOSE, SKIPZERO */
182 /* This table identifies those opcodes that inspect a character. It is used to
183 remember the fact that a character could have been inspected when the end of
184 the subject is reached. ***NOTE*** If the start of this table is modified, the
185 two tables that follow must also be modified. */
/* poptable is indexed by opcode value. In the state loop, a non-zero entry
for the current opcode while clen == 0 (end of subject reached) sets
could_continue to TRUE. */
187 static const pcre_uint8 poptable
[] = {
189 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
190 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
191 1, 1, 1, /* Any, AllAny, Anybyte */
193 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
195 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
200 /* Positive single-char repeats */
201 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
202 1, 1, 1, /* upto, minupto, exact */
203 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
204 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
205 1, 1, 1, /* upto I, minupto I, exact I */
206 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
207 /* Negative single-char repeats - only for chars < 256 */
208 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
209 1, 1, 1, /* NOT upto, minupto, exact */
210 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
211 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
212 1, 1, 1, /* NOT upto I, minupto I, exact I */
213 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
214 /* Positive type repeats */
215 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
216 1, 1, 1, /* Type upto, minupto, exact */
217 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
218 /* Character class & ref repeats */
219 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
220 1, 1, /* CRRANGE, CRMINRANGE */
223 1, /* XCLASS - variable length */
236 0, /* Assert behind */
237 0, /* Assert behind not */
238 0, 0, /* ONCE, ONCE_NC */
239 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
240 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
241 0, 0, /* CREF, NCREF */
242 0, 0, /* RREF, NRREF */
244 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
245 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
246 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
247 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
248 0, 0 /* CLOSE, SKIPZERO */
251 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, and \w */
/* toptable1 supplies the ctypes bit mask for the state-loop test
((ctypes[c] & toptable1[x]) ^ toptable2[x]) != 0, where x is the opcode, or
the type argument d for the OP_TYPE repeats. Each ctype mask is listed twice
in a row. NOTE(review): presumably one entry for the negated and one for the
plain opcode of each class - confirm against the opcode order. The
OP_ANY/OP_ALLANY masks are 0. */
254 static const pcre_uint8 toptable1
[] = {
256 ctype_digit
, ctype_digit
,
257 ctype_space
, ctype_space
,
258 ctype_word
, ctype_word
,
259 0, 0 /* OP_ANY, OP_ALLANY */
/* toptable2 supplies the value XORed with the masked ctypes bits in the
state-loop test ((ctypes[c] & toptable1[x]) ^ toptable2[x]) != 0. Its
OP_ANY/OP_ALLANY entries are 1 while the toptable1 masks for them are 0, so
the test is always non-zero for those two. NOTE(review): the rows for the
other opcodes are not visible in this view. */
262 static const pcre_uint8 toptable2
[] = {
267 1, 1 /* OP_ANY, OP_ALLANY */
271 /* Structure for holding data about a particular state, which is in effect the
272 current data for an active path through the match tree. It must consist
273 entirely of ints because the working vector we are passed, and which we put
274 these structures in, is a vector of ints. */
/* One stateblock describes one active path. Arrays of these are laid over the
caller's int workspace: active_states is set to (stateblock *)(workspace + 2)
and new_states to active_states + wscount. */
276 typedef struct stateblock
{
277 int offset
; /* Offset of the opcode from start_code. A negative value means "hold
off going to this (negated) state until the number of characters in the data
field have been skipped". */
278 int count
; /* Count for repeats (read back as the number already matched) */
279 int data
; /* Extra data for some states; the skip count when offset is negative */
/* Number of ints occupied by one stateblock. The executor rounds wscount down
to a multiple of twice this value and divides by it, giving the number of
stateblocks available in each of the two state vectors. */
282 #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int))
286 /*************************************************
287 * Print character string *
288 *************************************************/
290 /* Character string printing function for debugging.
294 length number of bytes
/* pchars: debugging helper taking a string pointer p, a length, and an output
stream f. Only fragments of the body are visible in this view: each item is
fetched with c = *(p++) and tested with isprint(), and the visible fprintf()
writes c to f as a backslash-x escape with at least two hex digits.
NOTE(review): the loop over length and the printable-character branch are not
visible here - confirm against the full source. */
301 pchars(const pcre_uchar
*p
, int length
, FILE *f
)
306 if (isprint(c
= *(p
++)))
309 fprintf(f
, "\\x%02x", c
);
316 /*************************************************
317 * Execute a Regular Expression - DFA engine *
318 *************************************************/
320 /* This internal function applies a compiled pattern to a subject string,
321 starting at a given point, using a DFA engine. This function is called from the
322 external one, possibly multiple times if the pattern is not anchored. The
323 function calls itself recursively for some kinds of subpattern.
326 md the match_data block with fixed information
327 this_start_code the opening bracket of this subexpression's code
328 current_subject where we currently are in the subject string
329 start_offset start offset in the subject string
330 offsets vector to contain the matching string offsets
331 offsetcount size of same
332 workspace vector of workspace
334 rlevel function call recursion level
336 Returns: > 0 => number of match offset pairs placed in offsets
337 = 0 => offsets overflowed; longest matches are present
338 -1 => failed to match
339 < -1 => some kind of unexpected problem
341 The following macros are used for adding states to the two state vectors (one
342 for the current character, one for the following character). */
/* State-list append macros. Each one post-increments its list counter
(active_count or new_count) and compares the old value with wscount. If there
is room, it stores offset (x), count (y) and, for the _DATA variants, data (z)
through the corresponding next_*_state pointer. Otherwise it executes
"return PCRE_ERROR_DFA_WSSIZE", so these macros can only be used inside a
function that returns int. The expansion is an if/else with no trailing
semicolon: the caller supplies the ';', and call sites that sit inside an
if/else of their own wrap the invocation in braces. */
344 #define ADD_ACTIVE(x,y) \
345 if (active_count++ < wscount) \
347 next_active_state->offset = (x); \
348 next_active_state->count = (y); \
349 next_active_state++; \
350 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
352 else return PCRE_ERROR_DFA_WSSIZE
354 #define ADD_ACTIVE_DATA(x,y,z) \
355 if (active_count++ < wscount) \
357 next_active_state->offset = (x); \
358 next_active_state->count = (y); \
359 next_active_state->data = (z); \
360 next_active_state++; \
361 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
363 else return PCRE_ERROR_DFA_WSSIZE
365 #define ADD_NEW(x,y) \
366 if (new_count++ < wscount) \
368 next_new_state->offset = (x); \
369 next_new_state->count = (y); \
371 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
373 else return PCRE_ERROR_DFA_WSSIZE
375 #define ADD_NEW_DATA(x,y,z) \
376 if (new_count++ < wscount) \
378 next_new_state->offset = (x); \
379 next_new_state->count = (y); \
380 next_new_state->data = (z); \
382 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
383 (x), (y), (z), __LINE__)); \
385 else return PCRE_ERROR_DFA_WSSIZE
387 /* And now, here is the code */
392 const pcre_uchar
*this_start_code
,
393 const pcre_uchar
*current_subject
,
401 stateblock
*active_states
, *new_states
, *temp_states
;
402 stateblock
*next_active_state
, *next_new_state
;
404 const pcre_uint8
*ctypes
, *lcc
, *fcc
;
405 const pcre_uchar
*ptr
;
406 const pcre_uchar
*end_code
, *first_op
;
408 dfa_recursion_info new_recursive
;
410 int active_count
, new_count
, match_count
;
412 /* Some fields in the md block are frequently referenced, so we load them into
413 independent variables in the hope that this will perform better. */
415 const pcre_uchar
*start_subject
= md
->start_subject
;
416 const pcre_uchar
*end_subject
= md
->end_subject
;
417 const pcre_uchar
*start_code
= md
->start_code
;
420 BOOL utf
= (md
->poptions
& PCRE_UTF8
) != 0;
425 BOOL reset_could_continue
= FALSE
;
431 wscount
= (wscount
- (wscount
% (INTS_PER_STATEBLOCK
* 2))) /
432 (2 * INTS_PER_STATEBLOCK
);
434 DPRINTF(("\n%.*s---------------------\n"
435 "%.*sCall to internal_dfa_exec f=%d\n",
436 rlevel
*2-2, SP
, rlevel
*2-2, SP
, rlevel
));
438 ctypes
= md
->tables
+ ctypes_offset
;
439 lcc
= md
->tables
+ lcc_offset
;
440 fcc
= md
->tables
+ fcc_offset
;
442 match_count
= PCRE_ERROR_NOMATCH
; /* A negative number */
444 active_states
= (stateblock
*)(workspace
+ 2);
445 next_new_state
= new_states
= active_states
+ wscount
;
448 first_op
= this_start_code
+ 1 + LINK_SIZE
+
449 ((*this_start_code
== OP_CBRA
|| *this_start_code
== OP_SCBRA
||
450 *this_start_code
== OP_CBRAPOS
|| *this_start_code
== OP_SCBRAPOS
)
453 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
454 the alternative states onto the list, and find out where the end is. This
455 makes it possible to use this function recursively, when we want to stop at a
456 matching internal ket rather than at the end.
458 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
459 a backward assertion. In that case, we have to find out the maximum amount to
460 move back, and set up each alternative appropriately. */
462 if (*first_op
== OP_REVERSE
)
467 end_code
= this_start_code
;
470 int back
= GET(end_code
, 2+LINK_SIZE
);
471 if (back
> max_back
) max_back
= back
;
472 end_code
+= GET(end_code
, 1);
474 while (*end_code
== OP_ALT
);
476 /* If we can't go back the amount required for the longest lookbehind
477 pattern, go back as far as we can; some alternatives may still be viable. */
480 /* In character mode we have to step back character by character */
484 for (gone_back
= 0; gone_back
< max_back
; gone_back
++)
486 if (current_subject
<= start_subject
) break;
488 ACROSSCHAR(current_subject
> start_subject
, *current_subject
, current_subject
--);
494 /* In byte-mode we can do this quickly. */
497 gone_back
= (current_subject
- max_back
< start_subject
)?
498 (int)(current_subject
- start_subject
) : max_back
;
499 current_subject
-= gone_back
;
502 /* Save the earliest consulted character */
504 if (current_subject
< md
->start_used_ptr
)
505 md
->start_used_ptr
= current_subject
;
507 /* Now we can process the individual branches. */
509 end_code
= this_start_code
;
512 int back
= GET(end_code
, 2+LINK_SIZE
);
513 if (back
<= gone_back
)
515 int bstate
= (int)(end_code
- start_code
+ 2 + 2*LINK_SIZE
);
516 ADD_NEW_DATA(-bstate
, 0, gone_back
- back
);
518 end_code
+= GET(end_code
, 1);
520 while (*end_code
== OP_ALT
);
523 /* This is the code for a "normal" subpattern (not a backward assertion). The
524 start of a whole pattern is always one of these. If we are at the top level,
525 we may be asked to restart matching from the same point that we reached for a
526 previous partial match. We still have to scan through the top-level branches to
527 find the end state. */
531 end_code
= this_start_code
;
535 if (rlevel
== 1 && (md
->moptions
& PCRE_DFA_RESTART
) != 0)
537 do { end_code
+= GET(end_code
, 1); } while (*end_code
== OP_ALT
);
538 new_count
= workspace
[1];
540 memcpy(new_states
, active_states
, new_count
* sizeof(stateblock
));
547 int length
= 1 + LINK_SIZE
+
548 ((*this_start_code
== OP_CBRA
|| *this_start_code
== OP_SCBRA
||
549 *this_start_code
== OP_CBRAPOS
|| *this_start_code
== OP_SCBRAPOS
)
553 ADD_NEW((int)(end_code
- start_code
+ length
), 0);
554 end_code
+= GET(end_code
, 1);
555 length
= 1 + LINK_SIZE
;
557 while (*end_code
== OP_ALT
);
561 workspace
[0] = 0; /* Bit indicating which vector is current */
563 DPRINTF(("%.*sEnd state = %d\n", rlevel
*2-2, SP
, (int)(end_code
- start_code
)));
565 /* Loop for scanning the subject */
567 ptr
= current_subject
;
574 BOOL partial_newline
= FALSE
;
575 BOOL could_continue
= reset_could_continue
;
576 reset_could_continue
= FALSE
;
578 /* Make the new state list into the active state list and empty the
581 temp_states
= active_states
;
582 active_states
= new_states
;
583 new_states
= temp_states
;
584 active_count
= new_count
;
587 workspace
[0] ^= 1; /* Remember for the restarting feature */
588 workspace
[1] = active_count
;
591 printf("%.*sNext character: rest of subject = \"", rlevel
*2-2, SP
);
592 pchars(ptr
, STRLEN_UC(ptr
), stdout
);
595 printf("%.*sActive states: ", rlevel
*2-2, SP
);
596 for (i
= 0; i
< active_count
; i
++)
597 printf("%d/%d ", active_states
[i
].offset
, active_states
[i
].count
);
601 /* Set the pointers for adding new states */
603 next_active_state
= active_states
+ active_count
;
604 next_new_state
= new_states
;
606 /* Load the current character from the subject outside the loop, as many
607 different states may want to look at it, and we assume that at least one
610 if (ptr
< end_subject
)
612 clen
= 1; /* Number of data items in the character */
614 if (utf
) { GETCHARLEN(c
, ptr
, clen
); } else
615 #endif /* SUPPORT_UTF */
620 clen
= 0; /* This indicates the end of the subject */
621 c
= NOTACHAR
; /* This value should never actually be used */
624 /* Scan up the active states and act on each one. The result of an action
625 may be to add more states to the currently active list (e.g. on hitting a
626 parenthesis) or it may be to put states on the new list, for considering
627 when we move the character pointer on. */
629 for (i
= 0; i
< active_count
; i
++)
631 stateblock
*current_state
= active_states
+ i
;
632 BOOL caseless
= FALSE
;
633 const pcre_uchar
*code
;
634 int state_offset
= current_state
->offset
;
635 int count
, codevalue
, rrc
;
638 printf ("%.*sProcessing state %d c=", rlevel
*2-2, SP
, state_offset
);
639 if (clen
== 0) printf("EOL\n");
640 else if (c
> 32 && c
< 127) printf("'%c'\n", c
);
641 else printf("0x%02x\n", c
);
644 /* A negative offset is a special case meaning "hold off going to this
645 (negated) state until the number of characters in the data field have
646 been skipped". If the could_continue flag was passed over from a previous
647 state, arrange for it to be passed on. */
649 if (state_offset
< 0)
651 if (current_state
->data
> 0)
653 DPRINTF(("%.*sSkipping this character\n", rlevel
*2-2, SP
));
654 ADD_NEW_DATA(state_offset
, current_state
->count
,
655 current_state
->data
- 1);
656 if (could_continue
) reset_could_continue
= TRUE
;
661 current_state
->offset
= state_offset
= -state_offset
;
665 /* Check for a duplicate state with the same count, and skip if found.
666 See the note at the head of this module about the possibility of improving
669 for (j
= 0; j
< i
; j
++)
671 if (active_states
[j
].offset
== state_offset
&&
672 active_states
[j
].count
== current_state
->count
)
674 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel
*2-2, SP
));
675 goto NEXT_ACTIVE_STATE
;
679 /* The state offset is the offset to the opcode */
681 code
= start_code
+ state_offset
;
684 /* If this opcode inspects a character, but we are at the end of the
685 subject, remember the fact for use when testing for a partial match. */
687 if (clen
== 0 && poptable
[codevalue
] != 0)
688 could_continue
= TRUE
;
690 /* If this opcode is followed by an inline character, load it. It is
691 tempting to test for the presence of a subject character here, but that
692 is wrong, because sometimes zero repetitions of the subject are
695 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
696 argument that is not a data character - but is always one byte long because
697 the values are small. We have to take special action to deal with \P, \p,
698 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
699 these ones to new opcodes. */
701 if (coptable
[codevalue
] > 0)
705 if (utf
) { GETCHARLEN(d
, (code
+ coptable
[codevalue
]), dlen
); } else
706 #endif /* SUPPORT_UTF */
707 d
= code
[coptable
[codevalue
]];
708 if (codevalue
>= OP_TYPESTAR
)
712 case OP_ANYBYTE
: return PCRE_ERROR_DFA_UITEM
;
714 case OP_PROP
: codevalue
+= OP_PROP_EXTRA
; break;
715 case OP_ANYNL
: codevalue
+= OP_ANYNL_EXTRA
; break;
716 case OP_EXTUNI
: codevalue
+= OP_EXTUNI_EXTRA
; break;
718 case OP_HSPACE
: codevalue
+= OP_HSPACE_EXTRA
; break;
720 case OP_VSPACE
: codevalue
+= OP_VSPACE_EXTRA
; break;
727 dlen
= 0; /* Not strictly necessary, but compilers moan */
728 d
= NOTACHAR
; /* if these variables are not set. */
732 /* Now process the individual opcodes */
736 /* ========================================================================== */
737 /* These cases are never obeyed. This is a fudge that causes a compile-
738 time error if the vectors coptable or poptable, which are indexed by
739 opcode, are not the correct length. It seems to be the only way to do
740 such a check at compile time, as the sizeof() operator does not work
741 in the C preprocessor. */
743 case OP_TABLE_LENGTH
:
744 case OP_TABLE_LENGTH
+
745 ((sizeof(coptable
) == OP_TABLE_LENGTH
) &&
746 (sizeof(poptable
) == OP_TABLE_LENGTH
)):
749 /* ========================================================================== */
750 /* Reached a closing bracket. If not at the end of the pattern, carry
751 on with the next opcode. For repeating opcodes, also add the repeat
752 state. Note that KETRPOS will always be encountered at the end of the
753 subpattern, because the possessive subpattern repeats are always handled
754 using recursive calls. Thus, it never adds any new states.
756 At the end of the (sub)pattern, unless we have an empty string and
757 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
758 start of the subject, save the match data, shifting up all previous
759 matches so we always have the longest first. */
765 if (code
!= end_code
)
767 ADD_ACTIVE(state_offset
+ 1 + LINK_SIZE
, 0);
768 if (codevalue
!= OP_KET
)
770 ADD_ACTIVE(state_offset
- GET(code
, 1), 0);
775 if (ptr
> current_subject
||
776 ((md
->moptions
& PCRE_NOTEMPTY
) == 0 &&
777 ((md
->moptions
& PCRE_NOTEMPTY_ATSTART
) == 0 ||
778 current_subject
> start_subject
+ md
->start_offset
)))
780 if (match_count
< 0) match_count
= (offsetcount
>= 2)? 1 : 0;
781 else if (match_count
> 0 && ++match_count
* 2 > offsetcount
)
783 count
= ((match_count
== 0)? offsetcount
: match_count
* 2) - 2;
784 if (count
> 0) memmove(offsets
+ 2, offsets
, count
* sizeof(int));
785 if (offsetcount
>= 2)
787 offsets
[0] = (int)(current_subject
- start_subject
);
788 offsets
[1] = (int)(ptr
- start_subject
);
789 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel
*2-2, SP
,
790 offsets
[1] - offsets
[0], (char *)current_subject
));
792 if ((md
->moptions
& PCRE_DFA_SHORTEST
) != 0)
794 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
795 "%.*s---------------------\n\n", rlevel
*2-2, SP
, rlevel
,
796 match_count
, rlevel
*2-2, SP
));
803 /* ========================================================================== */
804 /* These opcodes add to the current list of states without looking
805 at the current character. */
807 /*-----------------------------------------------------------------*/
809 do { code
+= GET(code
, 1); } while (*code
== OP_ALT
);
810 ADD_ACTIVE((int)(code
- start_code
), 0);
813 /*-----------------------------------------------------------------*/
818 ADD_ACTIVE((int)(code
- start_code
+ 1 + LINK_SIZE
), 0);
819 code
+= GET(code
, 1);
821 while (*code
== OP_ALT
);
824 /*-----------------------------------------------------------------*/
827 ADD_ACTIVE((int)(code
- start_code
+ 1 + LINK_SIZE
+ IMM2_SIZE
), 0);
828 code
+= GET(code
, 1);
829 while (*code
== OP_ALT
)
831 ADD_ACTIVE((int)(code
- start_code
+ 1 + LINK_SIZE
), 0);
832 code
+= GET(code
, 1);
836 /*-----------------------------------------------------------------*/
839 ADD_ACTIVE(state_offset
+ 1, 0);
840 code
+= 1 + GET(code
, 2);
841 while (*code
== OP_ALT
) code
+= GET(code
, 1);
842 ADD_ACTIVE((int)(code
- start_code
+ 1 + LINK_SIZE
), 0);
845 /*-----------------------------------------------------------------*/
847 code
+= 1 + GET(code
, 2);
848 while (*code
== OP_ALT
) code
+= GET(code
, 1);
849 ADD_ACTIVE((int)(code
- start_code
+ 1 + LINK_SIZE
), 0);
852 /*-----------------------------------------------------------------*/
854 if (ptr
== start_subject
&& (md
->moptions
& PCRE_NOTBOL
) == 0)
855 { ADD_ACTIVE(state_offset
+ 1, 0); }
858 /*-----------------------------------------------------------------*/
860 if ((ptr
== start_subject
&& (md
->moptions
& PCRE_NOTBOL
) == 0) ||
861 (ptr
!= end_subject
&& WAS_NEWLINE(ptr
)))
862 { ADD_ACTIVE(state_offset
+ 1, 0); }
865 /*-----------------------------------------------------------------*/
867 if (ptr
>= end_subject
)
869 if ((md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
870 could_continue
= TRUE
;
871 else { ADD_ACTIVE(state_offset
+ 1, 0); }
875 /*-----------------------------------------------------------------*/
877 if (ptr
== start_subject
) { ADD_ACTIVE(state_offset
+ 1, 0); }
880 /*-----------------------------------------------------------------*/
882 if (ptr
== start_subject
+ start_offset
) { ADD_ACTIVE(state_offset
+ 1, 0); }
886 /* ========================================================================== */
887 /* These opcodes inspect the next subject character, and sometimes
888 the previous one as well, but do not have an argument. The variable
889 clen contains the length of the current character and is zero if we are
890 at the end of the subject. */
892 /*-----------------------------------------------------------------*/
894 if (clen
> 0 && !IS_NEWLINE(ptr
))
896 if (ptr
+ 1 >= md
->end_subject
&&
897 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
898 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
899 NLBLOCK
->nllen
== 2 &&
902 could_continue
= partial_newline
= TRUE
;
906 ADD_NEW(state_offset
+ 1, 0);
911 /*-----------------------------------------------------------------*/
914 { ADD_NEW(state_offset
+ 1, 0); }
917 /*-----------------------------------------------------------------*/
919 if (clen
== 0 && (md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
920 could_continue
= TRUE
;
921 else if (clen
== 0 || (IS_NEWLINE(ptr
) && ptr
== end_subject
- md
->nllen
))
922 { ADD_ACTIVE(state_offset
+ 1, 0); }
925 /*-----------------------------------------------------------------*/
927 if ((md
->moptions
& PCRE_NOTEOL
) == 0)
929 if (clen
== 0 && (md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
930 could_continue
= TRUE
;
931 else if (clen
== 0 ||
932 ((md
->poptions
& PCRE_DOLLAR_ENDONLY
) == 0 && IS_NEWLINE(ptr
) &&
933 (ptr
== end_subject
- md
->nllen
)
935 { ADD_ACTIVE(state_offset
+ 1, 0); }
936 else if (ptr
+ 1 >= md
->end_subject
&&
937 (md
->moptions
& (PCRE_PARTIAL_HARD
|PCRE_PARTIAL_SOFT
)) != 0 &&
938 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
939 NLBLOCK
->nllen
== 2 &&
942 if ((md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
944 reset_could_continue
= TRUE
;
945 ADD_NEW_DATA(-(state_offset
+ 1), 0, 1);
947 else could_continue
= partial_newline
= TRUE
;
952 /*-----------------------------------------------------------------*/
954 if ((md
->moptions
& PCRE_NOTEOL
) == 0)
956 if (clen
== 0 && (md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
957 could_continue
= TRUE
;
958 else if (clen
== 0 ||
959 ((md
->poptions
& PCRE_DOLLAR_ENDONLY
) == 0 && IS_NEWLINE(ptr
)))
960 { ADD_ACTIVE(state_offset
+ 1, 0); }
961 else if (ptr
+ 1 >= md
->end_subject
&&
962 (md
->moptions
& (PCRE_PARTIAL_HARD
|PCRE_PARTIAL_SOFT
)) != 0 &&
963 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
964 NLBLOCK
->nllen
== 2 &&
967 if ((md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
969 reset_could_continue
= TRUE
;
970 ADD_NEW_DATA(-(state_offset
+ 1), 0, 1);
972 else could_continue
= partial_newline
= TRUE
;
975 else if (IS_NEWLINE(ptr
))
976 { ADD_ACTIVE(state_offset
+ 1, 0); }
979 /*-----------------------------------------------------------------*/
984 if (clen
> 0 && c
< 256 &&
985 ((ctypes
[c
] & toptable1
[codevalue
]) ^ toptable2
[codevalue
]) != 0)
986 { ADD_NEW(state_offset
+ 1, 0); }
989 /*-----------------------------------------------------------------*/
991 case OP_NOT_WHITESPACE
:
992 case OP_NOT_WORDCHAR
:
993 if (clen
> 0 && (c
>= 256 ||
994 ((ctypes
[c
] & toptable1
[codevalue
]) ^ toptable2
[codevalue
]) != 0))
995 { ADD_NEW(state_offset
+ 1, 0); }
998 /*-----------------------------------------------------------------*/
999 case OP_WORD_BOUNDARY
:
1000 case OP_NOT_WORD_BOUNDARY
:
1002 int left_word
, right_word
;
1004 if (ptr
> start_subject
)
1006 const pcre_uchar
*temp
= ptr
- 1;
1007 if (temp
< md
->start_used_ptr
) md
->start_used_ptr
= temp
;
1009 if (utf
) { BACKCHAR(temp
); }
1011 GETCHARTEST(d
, temp
);
1013 if ((md
->poptions
& PCRE_UCP
) != 0)
1015 if (d
== '_') left_word
= TRUE
; else
1017 int cat
= UCD_CATEGORY(d
);
1018 left_word
= (cat
== ucp_L
|| cat
== ucp_N
);
1023 left_word
= d
< 256 && (ctypes
[d
] & ctype_word
) != 0;
1025 else left_word
= FALSE
;
1030 if ((md
->poptions
& PCRE_UCP
) != 0)
1032 if (c
== '_') right_word
= TRUE
; else
1034 int cat
= UCD_CATEGORY(c
);
1035 right_word
= (cat
== ucp_L
|| cat
== ucp_N
);
1040 right_word
= c
< 256 && (ctypes
[c
] & ctype_word
) != 0;
1042 else right_word
= FALSE
;
1044 if ((left_word
== right_word
) == (codevalue
== OP_NOT_WORD_BOUNDARY
))
1045 { ADD_ACTIVE(state_offset
+ 1, 0); }
1050 /*-----------------------------------------------------------------*/
1051 /* Check the next character by Unicode property. We will get here only
1052 if the support is in the binary; otherwise a compile-time error occurs.
1061 const pcre_uint8 chartype
= UCD_CHARTYPE(c
);
1069 OK
= chartype
== ucp_Lu
|| chartype
== ucp_Ll
||
1074 OK
= PRIV(ucp_gentype
)[chartype
] == code
[2];
1078 OK
= chartype
== code
[2];
1082 OK
= UCD_SCRIPT(c
) == code
[2];
1085 /* These are specials for combination cases. */
1088 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1089 PRIV(ucp_gentype
)[chartype
] == ucp_N
;
1092 case PT_SPACE
: /* Perl space */
1093 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1094 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_FF
|| c
== CHAR_CR
;
1097 case PT_PXSPACE
: /* POSIX space */
1098 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1099 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_VT
||
1100 c
== CHAR_FF
|| c
== CHAR_CR
;
1104 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1105 PRIV(ucp_gentype
)[chartype
] == ucp_N
||
1106 c
== CHAR_UNDERSCORE
;
1109 /* Should never occur, but keep compilers from grumbling. */
1112 OK
= codevalue
!= OP_PROP
;
1116 if (OK
== (codevalue
== OP_PROP
)) { ADD_NEW(state_offset
+ 3, 0); }
1123 /* ========================================================================== */
1124 /* These opcodes likewise inspect the subject character, but have an
1125 argument that is not a data character. It is one of these opcodes:
1126 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
1127 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1130 case OP_TYPEMINPLUS
:
1131 case OP_TYPEPOSPLUS
:
1132 count
= current_state
->count
; /* Already matched */
1133 if (count
> 0) { ADD_ACTIVE(state_offset
+ 2, 0); }
1136 if (d
== OP_ANY
&& ptr
+ 1 >= md
->end_subject
&&
1137 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
1138 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
1139 NLBLOCK
->nllen
== 2 &&
1140 c
== NLBLOCK
->nl
[0])
1142 could_continue
= partial_newline
= TRUE
;
1144 else if ((c
>= 256 && d
!= OP_DIGIT
&& d
!= OP_WHITESPACE
&& d
!= OP_WORDCHAR
) ||
1146 (d
!= OP_ANY
|| !IS_NEWLINE(ptr
)) &&
1147 ((ctypes
[c
] & toptable1
[d
]) ^ toptable2
[d
]) != 0))
1149 if (count
> 0 && codevalue
== OP_TYPEPOSPLUS
)
1151 active_count
--; /* Remove non-match possibility */
1152 next_active_state
--;
1155 ADD_NEW(state_offset
, count
);
1160 /*-----------------------------------------------------------------*/
1162 case OP_TYPEMINQUERY
:
1163 case OP_TYPEPOSQUERY
:
1164 ADD_ACTIVE(state_offset
+ 2, 0);
1167 if (d
== OP_ANY
&& ptr
+ 1 >= md
->end_subject
&&
1168 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
1169 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
1170 NLBLOCK
->nllen
== 2 &&
1171 c
== NLBLOCK
->nl
[0])
1173 could_continue
= partial_newline
= TRUE
;
1175 else if ((c
>= 256 && d
!= OP_DIGIT
&& d
!= OP_WHITESPACE
&& d
!= OP_WORDCHAR
) ||
1177 (d
!= OP_ANY
|| !IS_NEWLINE(ptr
)) &&
1178 ((ctypes
[c
] & toptable1
[d
]) ^ toptable2
[d
]) != 0))
1180 if (codevalue
== OP_TYPEPOSQUERY
)
1182 active_count
--; /* Remove non-match possibility */
1183 next_active_state
--;
1185 ADD_NEW(state_offset
+ 2, 0);
1190 /*-----------------------------------------------------------------*/
1192 case OP_TYPEMINSTAR
:
1193 case OP_TYPEPOSSTAR
:
1194 ADD_ACTIVE(state_offset
+ 2, 0);
1197 if (d
== OP_ANY
&& ptr
+ 1 >= md
->end_subject
&&
1198 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
1199 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
1200 NLBLOCK
->nllen
== 2 &&
1201 c
== NLBLOCK
->nl
[0])
1203 could_continue
= partial_newline
= TRUE
;
1205 else if ((c
>= 256 && d
!= OP_DIGIT
&& d
!= OP_WHITESPACE
&& d
!= OP_WORDCHAR
) ||
1207 (d
!= OP_ANY
|| !IS_NEWLINE(ptr
)) &&
1208 ((ctypes
[c
] & toptable1
[d
]) ^ toptable2
[d
]) != 0))
1210 if (codevalue
== OP_TYPEPOSSTAR
)
1212 active_count
--; /* Remove non-match possibility */
1213 next_active_state
--;
1215 ADD_NEW(state_offset
, 0);
1220 /*-----------------------------------------------------------------*/
1222 count
= current_state
->count
; /* Number already matched */
1225 if (d
== OP_ANY
&& ptr
+ 1 >= md
->end_subject
&&
1226 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
1227 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
1228 NLBLOCK
->nllen
== 2 &&
1229 c
== NLBLOCK
->nl
[0])
1231 could_continue
= partial_newline
= TRUE
;
1233 else if ((c
>= 256 && d
!= OP_DIGIT
&& d
!= OP_WHITESPACE
&& d
!= OP_WORDCHAR
) ||
1235 (d
!= OP_ANY
|| !IS_NEWLINE(ptr
)) &&
1236 ((ctypes
[c
] & toptable1
[d
]) ^ toptable2
[d
]) != 0))
1238 if (++count
>= GET2(code
, 1))
1239 { ADD_NEW(state_offset
+ 1 + IMM2_SIZE
+ 1, 0); }
1241 { ADD_NEW(state_offset
, count
); }
1246 /*-----------------------------------------------------------------*/
1248 case OP_TYPEMINUPTO
:
1249 case OP_TYPEPOSUPTO
:
1250 ADD_ACTIVE(state_offset
+ 2 + IMM2_SIZE
, 0);
1251 count
= current_state
->count
; /* Number already matched */
1254 if (d
== OP_ANY
&& ptr
+ 1 >= md
->end_subject
&&
1255 (md
->moptions
& (PCRE_PARTIAL_HARD
)) != 0 &&
1256 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
1257 NLBLOCK
->nllen
== 2 &&
1258 c
== NLBLOCK
->nl
[0])
1260 could_continue
= partial_newline
= TRUE
;
1262 else if ((c
>= 256 && d
!= OP_DIGIT
&& d
!= OP_WHITESPACE
&& d
!= OP_WORDCHAR
) ||
1264 (d
!= OP_ANY
|| !IS_NEWLINE(ptr
)) &&
1265 ((ctypes
[c
] & toptable1
[d
]) ^ toptable2
[d
]) != 0))
1267 if (codevalue
== OP_TYPEPOSUPTO
)
1269 active_count
--; /* Remove non-match possibility */
1270 next_active_state
--;
1272 if (++count
>= GET2(code
, 1))
1273 { ADD_NEW(state_offset
+ 2 + IMM2_SIZE
, 0); }
1275 { ADD_NEW(state_offset
, count
); }
1280 /* ========================================================================== */
1281 /* These are virtual opcodes that are used when something like
1282 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1283 argument. It keeps the code above fast for the other cases. The argument
1284 is in the d variable. */
1287 case OP_PROP_EXTRA
+ OP_TYPEPLUS
:
1288 case OP_PROP_EXTRA
+ OP_TYPEMINPLUS
:
1289 case OP_PROP_EXTRA
+ OP_TYPEPOSPLUS
:
1290 count
= current_state
->count
; /* Already matched */
1291 if (count
> 0) { ADD_ACTIVE(state_offset
+ 4, 0); }
1295 const pcre_uint8 chartype
= UCD_CHARTYPE(c
);
1303 OK
= chartype
== ucp_Lu
|| chartype
== ucp_Ll
||
1308 OK
= PRIV(ucp_gentype
)[chartype
] == code
[3];
1312 OK
= chartype
== code
[3];
1316 OK
= UCD_SCRIPT(c
) == code
[3];
1319 /* These are specials for combination cases. */
1322 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1323 PRIV(ucp_gentype
)[chartype
] == ucp_N
;
1326 case PT_SPACE
: /* Perl space */
1327 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1328 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_FF
|| c
== CHAR_CR
;
1331 case PT_PXSPACE
: /* POSIX space */
1332 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1333 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_VT
||
1334 c
== CHAR_FF
|| c
== CHAR_CR
;
1338 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1339 PRIV(ucp_gentype
)[chartype
] == ucp_N
||
1340 c
== CHAR_UNDERSCORE
;
1343 /* Should never occur, but keep compilers from grumbling. */
1346 OK
= codevalue
!= OP_PROP
;
1350 if (OK
== (d
== OP_PROP
))
1352 if (count
> 0 && codevalue
== OP_PROP_EXTRA
+ OP_TYPEPOSPLUS
)
1354 active_count
--; /* Remove non-match possibility */
1355 next_active_state
--;
1358 ADD_NEW(state_offset
, count
);
1363 /*-----------------------------------------------------------------*/
1364 case OP_EXTUNI_EXTRA
+ OP_TYPEPLUS
:
1365 case OP_EXTUNI_EXTRA
+ OP_TYPEMINPLUS
:
1366 case OP_EXTUNI_EXTRA
+ OP_TYPEPOSPLUS
:
1367 count
= current_state
->count
; /* Already matched */
1368 if (count
> 0) { ADD_ACTIVE(state_offset
+ 2, 0); }
1369 if (clen
> 0 && UCD_CATEGORY(c
) != ucp_M
)
1371 const pcre_uchar
*nptr
= ptr
+ clen
;
1373 if (count
> 0 && codevalue
== OP_EXTUNI_EXTRA
+ OP_TYPEPOSPLUS
)
1375 active_count
--; /* Remove non-match possibility */
1376 next_active_state
--;
1378 while (nptr
< end_subject
)
1382 GETCHARLEN(nd
, nptr
, ndlen
);
1383 if (UCD_CATEGORY(nd
) != ucp_M
) break;
1388 ADD_NEW_DATA(-state_offset
, count
, ncount
);
1393 /*-----------------------------------------------------------------*/
1394 case OP_ANYNL_EXTRA
+ OP_TYPEPLUS
:
1395 case OP_ANYNL_EXTRA
+ OP_TYPEMINPLUS
:
1396 case OP_ANYNL_EXTRA
+ OP_TYPEPOSPLUS
:
1397 count
= current_state
->count
; /* Already matched */
1398 if (count
> 0) { ADD_ACTIVE(state_offset
+ 2, 0); }
1409 if ((md
->moptions
& PCRE_BSR_ANYCRLF
) != 0) break;
1413 if (ptr
+ 1 < end_subject
&& ptr
[1] == 0x0a) ncount
= 1;
1418 if (count
> 0 && codevalue
== OP_ANYNL_EXTRA
+ OP_TYPEPOSPLUS
)
1420 active_count
--; /* Remove non-match possibility */
1421 next_active_state
--;
1424 ADD_NEW_DATA(-state_offset
, count
, ncount
);
1433 /*-----------------------------------------------------------------*/
1434 case OP_VSPACE_EXTRA
+ OP_TYPEPLUS
:
1435 case OP_VSPACE_EXTRA
+ OP_TYPEMINPLUS
:
1436 case OP_VSPACE_EXTRA
+ OP_TYPEPOSPLUS
:
1437 count
= current_state
->count
; /* Already matched */
1438 if (count
> 0) { ADD_ACTIVE(state_offset
+ 2, 0); }
1459 if (OK
== (d
== OP_VSPACE
))
1461 if (count
> 0 && codevalue
== OP_VSPACE_EXTRA
+ OP_TYPEPOSPLUS
)
1463 active_count
--; /* Remove non-match possibility */
1464 next_active_state
--;
1467 ADD_NEW_DATA(-state_offset
, count
, 0);
1472 /*-----------------------------------------------------------------*/
1473 case OP_HSPACE_EXTRA
+ OP_TYPEPLUS
:
1474 case OP_HSPACE_EXTRA
+ OP_TYPEMINPLUS
:
1475 case OP_HSPACE_EXTRA
+ OP_TYPEPOSPLUS
:
1476 count
= current_state
->count
; /* Already matched */
1477 if (count
> 0) { ADD_ACTIVE(state_offset
+ 2, 0); }
1484 case 0x20: /* SPACE */
1485 case 0xa0: /* NBSP */
1486 case 0x1680: /* OGHAM SPACE MARK */
1487 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1488 case 0x2000: /* EN QUAD */
1489 case 0x2001: /* EM QUAD */
1490 case 0x2002: /* EN SPACE */
1491 case 0x2003: /* EM SPACE */
1492 case 0x2004: /* THREE-PER-EM SPACE */
1493 case 0x2005: /* FOUR-PER-EM SPACE */
1494 case 0x2006: /* SIX-PER-EM SPACE */
1495 case 0x2007: /* FIGURE SPACE */
1496 case 0x2008: /* PUNCTUATION SPACE */
1497 case 0x2009: /* THIN SPACE */
1498 case 0x200A: /* HAIR SPACE */
1499 case 0x202f: /* NARROW NO-BREAK SPACE */
1500 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1501 case 0x3000: /* IDEOGRAPHIC SPACE */
1510 if (OK
== (d
== OP_HSPACE
))
1512 if (count
> 0 && codevalue
== OP_HSPACE_EXTRA
+ OP_TYPEPOSPLUS
)
1514 active_count
--; /* Remove non-match possibility */
1515 next_active_state
--;
1518 ADD_NEW_DATA(-state_offset
, count
, 0);
1523 /*-----------------------------------------------------------------*/
1525 case OP_PROP_EXTRA
+ OP_TYPEQUERY
:
1526 case OP_PROP_EXTRA
+ OP_TYPEMINQUERY
:
1527 case OP_PROP_EXTRA
+ OP_TYPEPOSQUERY
:
1531 case OP_PROP_EXTRA
+ OP_TYPESTAR
:
1532 case OP_PROP_EXTRA
+ OP_TYPEMINSTAR
:
1533 case OP_PROP_EXTRA
+ OP_TYPEPOSSTAR
:
1538 ADD_ACTIVE(state_offset
+ 4, 0);
1542 const pcre_uint8 chartype
= UCD_CHARTYPE(c
);
1550 OK
= chartype
== ucp_Lu
|| chartype
== ucp_Ll
||
1555 OK
= PRIV(ucp_gentype
)[chartype
] == code
[3];
1559 OK
= chartype
== code
[3];
1563 OK
= UCD_SCRIPT(c
) == code
[3];
1566 /* These are specials for combination cases. */
1569 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1570 PRIV(ucp_gentype
)[chartype
] == ucp_N
;
1573 case PT_SPACE
: /* Perl space */
1574 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1575 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_FF
|| c
== CHAR_CR
;
1578 case PT_PXSPACE
: /* POSIX space */
1579 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1580 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_VT
||
1581 c
== CHAR_FF
|| c
== CHAR_CR
;
1585 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1586 PRIV(ucp_gentype
)[chartype
] == ucp_N
||
1587 c
== CHAR_UNDERSCORE
;
1590 /* Should never occur, but keep compilers from grumbling. */
1593 OK
= codevalue
!= OP_PROP
;
1597 if (OK
== (d
== OP_PROP
))
1599 if (codevalue
== OP_PROP_EXTRA
+ OP_TYPEPOSSTAR
||
1600 codevalue
== OP_PROP_EXTRA
+ OP_TYPEPOSQUERY
)
1602 active_count
--; /* Remove non-match possibility */
1603 next_active_state
--;
1605 ADD_NEW(state_offset
+ count
, 0);
1610 /*-----------------------------------------------------------------*/
1611 case OP_EXTUNI_EXTRA
+ OP_TYPEQUERY
:
1612 case OP_EXTUNI_EXTRA
+ OP_TYPEMINQUERY
:
1613 case OP_EXTUNI_EXTRA
+ OP_TYPEPOSQUERY
:
1617 case OP_EXTUNI_EXTRA
+ OP_TYPESTAR
:
1618 case OP_EXTUNI_EXTRA
+ OP_TYPEMINSTAR
:
1619 case OP_EXTUNI_EXTRA
+ OP_TYPEPOSSTAR
:
1624 ADD_ACTIVE(state_offset
+ 2, 0);
1625 if (clen
> 0 && UCD_CATEGORY(c
) != ucp_M
)
1627 const pcre_uchar
*nptr
= ptr
+ clen
;
1629 if (codevalue
== OP_EXTUNI_EXTRA
+ OP_TYPEPOSSTAR
||
1630 codevalue
== OP_EXTUNI_EXTRA
+ OP_TYPEPOSQUERY
)
1632 active_count
--; /* Remove non-match possibility */
1633 next_active_state
--;
1635 while (nptr
< end_subject
)
1639 GETCHARLEN(nd
, nptr
, ndlen
);
1640 if (UCD_CATEGORY(nd
) != ucp_M
) break;
1644 ADD_NEW_DATA(-(state_offset
+ count
), 0, ncount
);
1649 /*-----------------------------------------------------------------*/
1650 case OP_ANYNL_EXTRA
+ OP_TYPEQUERY
:
1651 case OP_ANYNL_EXTRA
+ OP_TYPEMINQUERY
:
1652 case OP_ANYNL_EXTRA
+ OP_TYPEPOSQUERY
:
1656 case OP_ANYNL_EXTRA
+ OP_TYPESTAR
:
1657 case OP_ANYNL_EXTRA
+ OP_TYPEMINSTAR
:
1658 case OP_ANYNL_EXTRA
+ OP_TYPEPOSSTAR
:
1662 ADD_ACTIVE(state_offset
+ 2, 0);
1673 if ((md
->moptions
& PCRE_BSR_ANYCRLF
) != 0) break;
1677 if (ptr
+ 1 < end_subject
&& ptr
[1] == 0x0a) ncount
= 1;
1682 if (codevalue
== OP_ANYNL_EXTRA
+ OP_TYPEPOSSTAR
||
1683 codevalue
== OP_ANYNL_EXTRA
+ OP_TYPEPOSQUERY
)
1685 active_count
--; /* Remove non-match possibility */
1686 next_active_state
--;
1688 ADD_NEW_DATA(-(state_offset
+ count
), 0, ncount
);
1697 /*-----------------------------------------------------------------*/
1698 case OP_VSPACE_EXTRA
+ OP_TYPEQUERY
:
1699 case OP_VSPACE_EXTRA
+ OP_TYPEMINQUERY
:
1700 case OP_VSPACE_EXTRA
+ OP_TYPEPOSQUERY
:
1704 case OP_VSPACE_EXTRA
+ OP_TYPESTAR
:
1705 case OP_VSPACE_EXTRA
+ OP_TYPEMINSTAR
:
1706 case OP_VSPACE_EXTRA
+ OP_TYPEPOSSTAR
:
1710 ADD_ACTIVE(state_offset
+ 2, 0);
1730 if (OK
== (d
== OP_VSPACE
))
1732 if (codevalue
== OP_VSPACE_EXTRA
+ OP_TYPEPOSSTAR
||
1733 codevalue
== OP_VSPACE_EXTRA
+ OP_TYPEPOSQUERY
)
1735 active_count
--; /* Remove non-match possibility */
1736 next_active_state
--;
1738 ADD_NEW_DATA(-(state_offset
+ count
), 0, 0);
1743 /*-----------------------------------------------------------------*/
1744 case OP_HSPACE_EXTRA
+ OP_TYPEQUERY
:
1745 case OP_HSPACE_EXTRA
+ OP_TYPEMINQUERY
:
1746 case OP_HSPACE_EXTRA
+ OP_TYPEPOSQUERY
:
1750 case OP_HSPACE_EXTRA
+ OP_TYPESTAR
:
1751 case OP_HSPACE_EXTRA
+ OP_TYPEMINSTAR
:
1752 case OP_HSPACE_EXTRA
+ OP_TYPEPOSSTAR
:
1756 ADD_ACTIVE(state_offset
+ 2, 0);
1763 case 0x20: /* SPACE */
1764 case 0xa0: /* NBSP */
1765 case 0x1680: /* OGHAM SPACE MARK */
1766 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1767 case 0x2000: /* EN QUAD */
1768 case 0x2001: /* EM QUAD */
1769 case 0x2002: /* EN SPACE */
1770 case 0x2003: /* EM SPACE */
1771 case 0x2004: /* THREE-PER-EM SPACE */
1772 case 0x2005: /* FOUR-PER-EM SPACE */
1773 case 0x2006: /* SIX-PER-EM SPACE */
1774 case 0x2007: /* FIGURE SPACE */
1775 case 0x2008: /* PUNCTUATION SPACE */
1776 case 0x2009: /* THIN SPACE */
1777 case 0x200A: /* HAIR SPACE */
1778 case 0x202f: /* NARROW NO-BREAK SPACE */
1779 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1780 case 0x3000: /* IDEOGRAPHIC SPACE */
1789 if (OK
== (d
== OP_HSPACE
))
1791 if (codevalue
== OP_HSPACE_EXTRA
+ OP_TYPEPOSSTAR
||
1792 codevalue
== OP_HSPACE_EXTRA
+ OP_TYPEPOSQUERY
)
1794 active_count
--; /* Remove non-match possibility */
1795 next_active_state
--;
1797 ADD_NEW_DATA(-(state_offset
+ count
), 0, 0);
1802 /*-----------------------------------------------------------------*/
1804 case OP_PROP_EXTRA
+ OP_TYPEEXACT
:
1805 case OP_PROP_EXTRA
+ OP_TYPEUPTO
:
1806 case OP_PROP_EXTRA
+ OP_TYPEMINUPTO
:
1807 case OP_PROP_EXTRA
+ OP_TYPEPOSUPTO
:
1808 if (codevalue
!= OP_PROP_EXTRA
+ OP_TYPEEXACT
)
1809 { ADD_ACTIVE(state_offset
+ 1 + IMM2_SIZE
+ 3, 0); }
1810 count
= current_state
->count
; /* Number already matched */
1814 const pcre_uint8 chartype
= UCD_CHARTYPE(c
);
1815 switch(code
[1 + IMM2_SIZE
+ 1])
1822 OK
= chartype
== ucp_Lu
|| chartype
== ucp_Ll
||
1827 OK
= PRIV(ucp_gentype
)[chartype
] == code
[1 + IMM2_SIZE
+ 2];
1831 OK
= chartype
== code
[1 + IMM2_SIZE
+ 2];
1835 OK
= UCD_SCRIPT(c
) == code
[1 + IMM2_SIZE
+ 2];
1838 /* These are specials for combination cases. */
1841 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1842 PRIV(ucp_gentype
)[chartype
] == ucp_N
;
1845 case PT_SPACE
: /* Perl space */
1846 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1847 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_FF
|| c
== CHAR_CR
;
1850 case PT_PXSPACE
: /* POSIX space */
1851 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_Z
||
1852 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_VT
||
1853 c
== CHAR_FF
|| c
== CHAR_CR
;
1857 OK
= PRIV(ucp_gentype
)[chartype
] == ucp_L
||
1858 PRIV(ucp_gentype
)[chartype
] == ucp_N
||
1859 c
== CHAR_UNDERSCORE
;
1862 /* Should never occur, but keep compilers from grumbling. */
1865 OK
= codevalue
!= OP_PROP
;
1869 if (OK
== (d
== OP_PROP
))
1871 if (codevalue
== OP_PROP_EXTRA
+ OP_TYPEPOSUPTO
)
1873 active_count
--; /* Remove non-match possibility */
1874 next_active_state
--;
1876 if (++count
>= GET2(code
, 1))
1877 { ADD_NEW(state_offset
+ 1 + IMM2_SIZE
+ 3, 0); }
1879 { ADD_NEW(state_offset
, count
); }
1884 /*-----------------------------------------------------------------*/
1885 case OP_EXTUNI_EXTRA
+ OP_TYPEEXACT
:
1886 case OP_EXTUNI_EXTRA
+ OP_TYPEUPTO
:
1887 case OP_EXTUNI_EXTRA
+ OP_TYPEMINUPTO
:
1888 case OP_EXTUNI_EXTRA
+ OP_TYPEPOSUPTO
:
1889 if (codevalue
!= OP_EXTUNI_EXTRA
+ OP_TYPEEXACT
)
1890 { ADD_ACTIVE(state_offset
+ 2 + IMM2_SIZE
, 0); }
1891 count
= current_state
->count
; /* Number already matched */
1892 if (clen
> 0 && UCD_CATEGORY(c
) != ucp_M
)
1894 const pcre_uchar
*nptr
= ptr
+ clen
;
1896 if (codevalue
== OP_EXTUNI_EXTRA
+ OP_TYPEPOSUPTO
)
1898 active_count
--; /* Remove non-match possibility */
1899 next_active_state
--;
1901 while (nptr
< end_subject
)
1905 GETCHARLEN(nd
, nptr
, ndlen
);
1906 if (UCD_CATEGORY(nd
) != ucp_M
) break;
1910 if (nptr
>= end_subject
&& (md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
1911 reset_could_continue
= TRUE
;
1912 if (++count
>= GET2(code
, 1))
1913 { ADD_NEW_DATA(-(state_offset
+ 2 + IMM2_SIZE
), 0, ncount
); }
1915 { ADD_NEW_DATA(-state_offset
, count
, ncount
); }
1920 /*-----------------------------------------------------------------*/
1921 case OP_ANYNL_EXTRA
+ OP_TYPEEXACT
:
1922 case OP_ANYNL_EXTRA
+ OP_TYPEUPTO
:
1923 case OP_ANYNL_EXTRA
+ OP_TYPEMINUPTO
:
1924 case OP_ANYNL_EXTRA
+ OP_TYPEPOSUPTO
:
1925 if (codevalue
!= OP_ANYNL_EXTRA
+ OP_TYPEEXACT
)
1926 { ADD_ACTIVE(state_offset
+ 2 + IMM2_SIZE
, 0); }
1927 count
= current_state
->count
; /* Number already matched */
1938 if ((md
->moptions
& PCRE_BSR_ANYCRLF
) != 0) break;
1942 if (ptr
+ 1 < end_subject
&& ptr
[1] == 0x0a) ncount
= 1;
1947 if (codevalue
== OP_ANYNL_EXTRA
+ OP_TYPEPOSUPTO
)
1949 active_count
--; /* Remove non-match possibility */
1950 next_active_state
--;
1952 if (++count
>= GET2(code
, 1))
1953 { ADD_NEW_DATA(-(state_offset
+ 2 + IMM2_SIZE
), 0, ncount
); }
1955 { ADD_NEW_DATA(-state_offset
, count
, ncount
); }
1964 /*-----------------------------------------------------------------*/
1965 case OP_VSPACE_EXTRA
+ OP_TYPEEXACT
:
1966 case OP_VSPACE_EXTRA
+ OP_TYPEUPTO
:
1967 case OP_VSPACE_EXTRA
+ OP_TYPEMINUPTO
:
1968 case OP_VSPACE_EXTRA
+ OP_TYPEPOSUPTO
:
1969 if (codevalue
!= OP_VSPACE_EXTRA
+ OP_TYPEEXACT
)
1970 { ADD_ACTIVE(state_offset
+ 2 + IMM2_SIZE
, 0); }
1971 count
= current_state
->count
; /* Number already matched */
1991 if (OK
== (d
== OP_VSPACE
))
1993 if (codevalue
== OP_VSPACE_EXTRA
+ OP_TYPEPOSUPTO
)
1995 active_count
--; /* Remove non-match possibility */
1996 next_active_state
--;
1998 if (++count
>= GET2(code
, 1))
1999 { ADD_NEW_DATA(-(state_offset
+ 2 + IMM2_SIZE
), 0, 0); }
2001 { ADD_NEW_DATA(-state_offset
, count
, 0); }
2006 /*-----------------------------------------------------------------*/
2007 case OP_HSPACE_EXTRA
+ OP_TYPEEXACT
:
2008 case OP_HSPACE_EXTRA
+ OP_TYPEUPTO
:
2009 case OP_HSPACE_EXTRA
+ OP_TYPEMINUPTO
:
2010 case OP_HSPACE_EXTRA
+ OP_TYPEPOSUPTO
:
2011 if (codevalue
!= OP_HSPACE_EXTRA
+ OP_TYPEEXACT
)
2012 { ADD_ACTIVE(state_offset
+ 2 + IMM2_SIZE
, 0); }
2013 count
= current_state
->count
; /* Number already matched */
2020 case 0x20: /* SPACE */
2021 case 0xa0: /* NBSP */
2022 case 0x1680: /* OGHAM SPACE MARK */
2023 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2024 case 0x2000: /* EN QUAD */
2025 case 0x2001: /* EM QUAD */
2026 case 0x2002: /* EN SPACE */
2027 case 0x2003: /* EM SPACE */
2028 case 0x2004: /* THREE-PER-EM SPACE */
2029 case 0x2005: /* FOUR-PER-EM SPACE */
2030 case 0x2006: /* SIX-PER-EM SPACE */
2031 case 0x2007: /* FIGURE SPACE */
2032 case 0x2008: /* PUNCTUATION SPACE */
2033 case 0x2009: /* THIN SPACE */
2034 case 0x200A: /* HAIR SPACE */
2035 case 0x202f: /* NARROW NO-BREAK SPACE */
2036 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2037 case 0x3000: /* IDEOGRAPHIC SPACE */
2046 if (OK
== (d
== OP_HSPACE
))
2048 if (codevalue
== OP_HSPACE_EXTRA
+ OP_TYPEPOSUPTO
)
2050 active_count
--; /* Remove non-match possibility */
2051 next_active_state
--;
2053 if (++count
>= GET2(code
, 1))
2054 { ADD_NEW_DATA(-(state_offset
+ 2 + IMM2_SIZE
), 0, 0); }
2056 { ADD_NEW_DATA(-state_offset
, count
, 0); }
2061 /* ========================================================================== */
2062 /* These opcodes are followed by a character that is usually compared
2063 to the current subject character; it is loaded into d. We still get
2064 here even if there is no subject character, because in some cases zero
2065 repetitions are permitted. */
2067 /*-----------------------------------------------------------------*/
2069 if (clen
> 0 && c
== d
) { ADD_NEW(state_offset
+ dlen
+ 1, 0); }
2072 /*-----------------------------------------------------------------*/
2074 if (clen
== 0) break;
2079 if (c
== d
) { ADD_NEW(state_offset
+ dlen
+ 1, 0); } else
2081 unsigned int othercase
;
2085 /* If we have Unicode property support, we can use it to test the
2086 other case of the character. */
2088 othercase
= UCD_OTHERCASE(c
);
2090 othercase
= NOTACHAR
;
2093 if (d
== othercase
) { ADD_NEW(state_offset
+ dlen
+ 1, 0); }
2097 #endif /* SUPPORT_UTF */
2100 if (TABLE_GET(c
, lcc
, c
) == TABLE_GET(d
, lcc
, d
))
2101 { ADD_NEW(state_offset
+ 2, 0); }
2107 /*-----------------------------------------------------------------*/
2108 /* This is a tricky one because it can match more than one character.
2109 Find out how many characters to skip, and then set up a negative state
2110 to wait for them to pass before continuing. */
2113 if (clen
> 0 && UCD_CATEGORY(c
) != ucp_M
)
2115 const pcre_uchar
*nptr
= ptr
+ clen
;
2117 while (nptr
< end_subject
)
2120 GETCHARLEN(c
, nptr
, nclen
);
2121 if (UCD_CATEGORY(c
) != ucp_M
) break;
2125 if (nptr
>= end_subject
&& (md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
2126 reset_could_continue
= TRUE
;
2127 ADD_NEW_DATA(-(state_offset
+ 1), 0, ncount
);
2132 /*-----------------------------------------------------------------*/
2133 /* This is tricky, like EXTUNI, because it too can match more than one
2134 character (when CR is followed by LF). In this case, set up a negative
2135 state to wait for one character to pass before continuing. */
2138 if (clen
> 0) switch(c
)
2145 if ((md
->moptions
& PCRE_BSR_ANYCRLF
) != 0) break;
2148 ADD_NEW(state_offset
+ 1, 0);
2152 if (ptr
+ 1 >= end_subject
)
2154 ADD_NEW(state_offset
+ 1, 0);
2155 if ((md
->moptions
& PCRE_PARTIAL_HARD
) != 0)
2156 reset_could_continue
= TRUE
;
2158 else if (ptr
[1] == 0x0a)
2160 ADD_NEW_DATA(-(state_offset
+ 1), 0, 1);
2164 ADD_NEW(state_offset
+ 1, 0);
2170 /*-----------------------------------------------------------------*/
2172 if (clen
> 0) switch(c
)
2184 ADD_NEW(state_offset
+ 1, 0);
2189 /*-----------------------------------------------------------------*/
2191 if (clen
> 0) switch(c
)
2200 ADD_NEW(state_offset
+ 1, 0);
2207 /*-----------------------------------------------------------------*/
2209 if (clen
> 0) switch(c
)
2212 case 0x20: /* SPACE */
2213 case 0xa0: /* NBSP */
2214 case 0x1680: /* OGHAM SPACE MARK */
2215 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2216 case 0x2000: /* EN QUAD */
2217 case 0x2001: /* EM QUAD */
2218 case 0x2002: /* EN SPACE */
2219 case 0x2003: /* EM SPACE */
2220 case 0x2004: /* THREE-PER-EM SPACE */
2221 case 0x2005: /* FOUR-PER-EM SPACE */
2222 case 0x2006: /* SIX-PER-EM SPACE */
2223 case 0x2007: /* FIGURE SPACE */
2224 case 0x2008: /* PUNCTUATION SPACE */
2225 case 0x2009: /* THIN SPACE */
2226 case 0x200A: /* HAIR SPACE */
2227 case 0x202f: /* NARROW NO-BREAK SPACE */
2228 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2229 case 0x3000: /* IDEOGRAPHIC SPACE */
2233 ADD_NEW(state_offset
+ 1, 0);
2238 /*-----------------------------------------------------------------*/
2240 if (clen
> 0) switch(c
)
2243 case 0x20: /* SPACE */
2244 case 0xa0: /* NBSP */
2245 case 0x1680: /* OGHAM SPACE MARK */
2246 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2247 case 0x2000: /* EN QUAD */
2248 case 0x2001: /* EM QUAD */
2249 case 0x2002: /* EN SPACE */
2250 case 0x2003: /* EM SPACE */
2251 case 0x2004: /* THREE-PER-EM SPACE */
2252 case 0x2005: /* FOUR-PER-EM SPACE */
2253 case 0x2006: /* SIX-PER-EM SPACE */
2254 case 0x2007: /* FIGURE SPACE */
2255 case 0x2008: /* PUNCTUATION SPACE */
2256 case 0x2009: /* THIN SPACE */
2257 case 0x200A: /* HAIR SPACE */
2258 case 0x202f: /* NARROW NO-BREAK SPACE */
2259 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2260 case 0x3000: /* IDEOGRAPHIC SPACE */
2261 ADD_NEW(state_offset
+ 1, 0);
2266 /*-----------------------------------------------------------------*/
2267 /* Match a negated single character casefully. */
2270 if (clen
> 0 && c
!= d
) { ADD_NEW(state_offset
+ dlen
+ 1, 0); }
2273 /*-----------------------------------------------------------------*/
2274 /* Match a negated single character caselessly. */
2279 unsigned int otherd
;
2281 if (utf
&& d
>= 128)
2284 otherd
= UCD_OTHERCASE(d
);
2285 #endif /* SUPPORT_UCP */
2288 #endif /* SUPPORT_UTF */
2289 otherd
= TABLE_GET(d
, fcc
, d
);
2290 if (c
!= d
&& c
!= otherd
)
2291 { ADD_NEW(state_offset
+ dlen
+ 1, 0); }
2295 /*-----------------------------------------------------------------*/
2300 case OP_NOTMINPLUSI
:
2301 case OP_NOTPOSPLUSI
:
2303 codevalue
-= OP_STARI
- OP_STAR
;
2312 count
= current_state
->count
; /* Already matched */
2313 if (count
> 0) { ADD_ACTIVE(state_offset
+ dlen
+ 1, 0); }
2316 unsigned int otherd
= NOTACHAR
;
2320 if (utf
&& d
>= 128)
2323 otherd
= UCD_OTHERCASE(d
);
2324 #endif /* SUPPORT_UCP */
2327 #endif /* SUPPORT_UTF */
2328 otherd
= TABLE_GET(d
, fcc
, d
);
2330 if ((c
== d
|| c
== otherd
) == (codevalue
< OP_NOTSTAR
))
2333 (codevalue
== OP_POSPLUS
|| codevalue
== OP_NOTPOSPLUS
))
2335 active_count
--; /* Remove non-match possibility */
2336 next_active_state
--;
2339 ADD_NEW(state_offset
, count
);
2344 /*-----------------------------------------------------------------*/
2349 case OP_NOTMINQUERYI
:
2350 case OP_NOTPOSQUERYI
:
2352 codevalue
-= OP_STARI
- OP_STAR
;
2358 case OP_NOTMINQUERY
:
2359 case OP_NOTPOSQUERY
:
2360 ADD_ACTIVE(state_offset
+ dlen
+ 1, 0);
2363 unsigned int otherd
= NOTACHAR
;
2367 if (utf
&& d
>= 128)
2370 otherd
= UCD_OTHERCASE(d
);
2371 #endif /* SUPPORT_UCP */
2374 #endif /* SUPPORT_UTF */
2375 otherd
= TABLE_GET(d
, fcc
, d
);
2377 if ((c
== d
|| c
== otherd
) == (codevalue
< OP_NOTSTAR
))
2379 if (codevalue
== OP_POSQUERY
|| codevalue
== OP_NOTPOSQUERY
)
2381 active_count
--; /* Remove non-match possibility */
2382 next_active_state
--;
2384 ADD_NEW(state_offset
+ dlen
+ 1, 0);
2389 /*-----------------------------------------------------------------*/
2394 case OP_NOTMINSTARI
:
2395 case OP_NOTPOSSTARI
:
2397 codevalue
-= OP_STARI
- OP_STAR
;
2405 ADD_ACTIVE(state_offset
+ dlen
+ 1, 0);
2408 unsigned int otherd
= NOTACHAR
;
2412 if (utf
&& d
>= 128)
2415 otherd
= UCD_OTHERCASE(d
);
2416 #endif /* SUPPORT_UCP */
2419 #endif /* SUPPORT_UTF */
2420 otherd
= TABLE_GET(d
, fcc
, d
);
2422 if ((c
== d
|| c
== otherd
) == (codevalue
< OP_NOTSTAR
))
2424 if (codevalue
== OP_POSSTAR
|| codevalue
== OP_NOTPOSSTAR
)
2426 active_count
--; /* Remove non-match possibility */
2427 next_active_state
--;
2429 ADD_NEW(state_offset
, 0);
2434 /*-----------------------------------------------------------------*/
2438 codevalue
-= OP_STARI
- OP_STAR
;
2442 count
= current_state
->count
; /* Number already matched */
2445 unsigned int otherd
= NOTACHAR
;
2449 if (utf
&& d
>= 128)
2452 otherd
= UCD_OTHERCASE(d
);
2453 #endif /* SUPPORT_UCP */
2456 #endif /* SUPPORT_UTF */
2457 otherd
= TABLE_GET(d
, fcc
, d
);
2459 if ((c
== d
|| c
== otherd
) == (codevalue
< OP_NOTSTAR
))
2461 if (++count
>= GET2(code
, 1))
2462 { ADD_NEW(state_offset
+ dlen
+ 1 + IMM2_SIZE
, 0); }
2464 { ADD_NEW(state_offset
, count
); }
2469 /*-----------------------------------------------------------------*/
2474 case OP_NOTMINUPTOI
:
2475 case OP_NOTPOSUPTOI
:
2477 codevalue
-= OP_STARI
- OP_STAR
;
2485 ADD_ACTIVE(state_offset
+ dlen
+ 1 + IMM2_SIZE
, 0);
2486 count
= current_state
->count
; /* Number already matched */
2489 unsigned int otherd
= NOTACHAR
;
2493 if (utf
&& d
>= 128)
2496 otherd
= UCD_OTHERCASE(d
);
2497 #endif /* SUPPORT_UCP */
2500 #endif /* SUPPORT_UTF */
2501 otherd
= TABLE_GET(d
, fcc
, d
);
2503 if ((c
== d
|| c
== otherd
) == (codevalue
< OP_NOTSTAR
))
2505 if (codevalue
== OP_POSUPTO
|| codevalue
== OP_NOTPOSUPTO
)
2507 active_count
--; /* Remove non-match possibility */
2508 next_active_state
--;
2510 if (++count
>= GET2(code
, 1))
2511 { ADD_NEW(state_offset
+ dlen
+ 1 + IMM2_SIZE
, 0); }
2513 { ADD_NEW(state_offset
, count
); }
2519 /* ========================================================================== */
2520 /* These are the class-handling opcodes */
2526 BOOL isinclass
= FALSE
;
2527 int next_state_offset
;
2528 const pcre_uchar
*ecode
;
2530 /* For a simple class, there is always just a 32-byte table, and we
2531 can set isinclass from it. */
2533 if (codevalue
!= OP_XCLASS
)
2535 ecode
= code
+ 1 + (32 / sizeof(pcre_uchar
));
2538 isinclass
= (c
> 255)? (codevalue
== OP_NCLASS
) :
2539 ((((pcre_uint8
*)(code
+ 1))[c
/8] & (1 << (c
&7))) != 0);
2543 /* An extended class may have a table or a list of single characters,
2544 ranges, or both, and it may be positive or negative. There's a
2545 function that sorts all this out. */
2549 ecode
= code
+ GET(code
, 1);
2550 if (clen
> 0) isinclass
= PRIV(xclass
)(c
, code
+ 1 + LINK_SIZE
, utf
);
2553 /* At this point, isinclass is set for all kinds of class, and ecode
2554 points to the byte after the end of the class. If there is a
2555 quantifier, this is where it will be. */
2557 next_state_offset
= (int)(ecode
- start_code
);
2563 ADD_ACTIVE(next_state_offset
+ 1, 0);
2564 if (isinclass
) { ADD_NEW(state_offset
, 0); }
2569 count
= current_state
->count
; /* Already matched */
2570 if (count
> 0) { ADD_ACTIVE(next_state_offset
+ 1, 0); }
2571 if (isinclass
) { count
++; ADD_NEW(state_offset
, count
); }
2576 ADD_ACTIVE(next_state_offset
+ 1, 0);
2577 if (isinclass
) { ADD_NEW(next_state_offset
+ 1, 0); }
2582 count
= current_state
->count
; /* Already matched */
2583 if (count
>= GET2(ecode
, 1))
2584 { ADD_ACTIVE(next_state_offset
+ 1 + 2 * IMM2_SIZE
, 0); }
2587 int max
= GET2(ecode
, 1 + IMM2_SIZE
);
2588 if (++count
>= max
&& max
!= 0) /* Max 0 => no limit */
2589 { ADD_NEW(next_state_offset
+ 1 + 2 * IMM2_SIZE
, 0); }
2591 { ADD_NEW(state_offset
, count
); }
2596 if (isinclass
) { ADD_NEW(next_state_offset
, 0); }
2602 /* ========================================================================== */
2603 /* These are the opcodes for fancy brackets of various kinds. We have
2604 to use recursion in order to handle them. The "always failing" assertion
2605 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2606 though the other "backtracking verbs" are not supported. */
2609 forced_fail
++; /* Count FAILs for multiple states */
2615 case OP_ASSERTBACK_NOT
:
2618 int local_offsets
[2];
2619 int local_workspace
[1000];
2620 const pcre_uchar
*endasscode
= code
+ GET(code
, 1);
2622 while (*endasscode
== OP_ALT
) endasscode
+= GET(endasscode
, 1);
2624 rc
= internal_dfa_exec(
2625 md
, /* static match data */
2626 code
, /* this subexpression's code */
2627 ptr
, /* where we currently are */
2628 (int)(ptr
- start_subject
), /* start offset */
2629 local_offsets
, /* offset vector */
2630 sizeof(local_offsets
)/sizeof(int), /* size of same */
2631 local_workspace
, /* workspace vector */
2632 sizeof(local_workspace
)/sizeof(int), /* size of same */
2633 rlevel
); /* function recursion level */
2635 if (rc
== PCRE_ERROR_DFA_UITEM
) return rc
;
2636 if ((rc
>= 0) == (codevalue
== OP_ASSERT
|| codevalue
== OP_ASSERTBACK
))
2637 { ADD_ACTIVE((int)(endasscode
+ LINK_SIZE
+ 1 - start_code
), 0); }
2641 /*-----------------------------------------------------------------*/
2645 int local_offsets
[1000];
2646 int local_workspace
[1000];
2647 int codelink
= GET(code
, 1);
2650 /* Because of the way auto-callout works during compile, a callout item
2651 is inserted between OP_COND and an assertion condition. This does not
2652 happen for the other conditions. */
2654 if (code
[LINK_SIZE
+1] == OP_CALLOUT
)
2657 if (PUBL(callout
) != NULL
)
2659 PUBL(callout_block
) cb
;
2660 cb
.version
= 1; /* Version 1 of the callout block */
2661 cb
.callout_number
= code
[LINK_SIZE
+2];
2662 cb
.offset_vector
= offsets
;
2663 #ifdef COMPILE_PCRE8
2664 cb
.subject
= (PCRE_SPTR
)start_subject
;
2666 cb
.subject
= (PCRE_SPTR16
)start_subject
;
2668 cb
.subject_length
= (int)(end_subject
- start_subject
);
2669 cb
.start_match
= (int)(current_subject
- start_subject
);
2670 cb
.current_position
= (int)(ptr
- start_subject
);
2671 cb
.pattern_position
= GET(code
, LINK_SIZE
+ 3);
2672 cb
.next_item_length
= GET(code
, 3 + 2*LINK_SIZE
);
2674 cb
.capture_last
= -1;
2675 cb
.callout_data
= md
->callout_data
;
2676 cb
.mark
= NULL
; /* No (*MARK) support */
2677 if ((rrc
= (*PUBL(callout
))(&cb
)) < 0) return rrc
; /* Abandon */
2679 if (rrc
> 0) break; /* Fail this thread */
2680 code
+= PRIV(OP_lengths
)[OP_CALLOUT
]; /* Skip callout data */
2683 condcode
= code
[LINK_SIZE
+1];
2685 /* Back reference conditions are not supported */
2687 if (condcode
== OP_CREF
|| condcode
== OP_NCREF
)
2688 return PCRE_ERROR_DFA_UCOND
;
2690 /* The DEFINE condition is always false */
2692 if (condcode
== OP_DEF
)
2693 { ADD_ACTIVE(state_offset
+ codelink
+ LINK_SIZE
+ 1, 0); }
/* The only supported version of OP_RREF is for the value RREF_ANY,
which means "test if in any recursion". We can't test for specifically
recursed groups. */
2699 else if (condcode
== OP_RREF
|| condcode
== OP_NRREF
)
2701 int value
= GET2(code
, LINK_SIZE
+ 2);
2702 if (value
!= RREF_ANY
) return PCRE_ERROR_DFA_UCOND
;
2703 if (md
->recursive
!= NULL
)
2704 { ADD_ACTIVE(state_offset
+ LINK_SIZE
+ 2 + IMM2_SIZE
, 0); }
2705 else { ADD_ACTIVE(state_offset
+ codelink
+ LINK_SIZE
+ 1, 0); }
2708 /* Otherwise, the condition is an assertion */
2713 const pcre_uchar
*asscode
= code
+ LINK_SIZE
+ 1;
2714 const pcre_uchar
*endasscode
= asscode
+ GET(asscode
, 1);
2716 while (*endasscode
== OP_ALT
) endasscode
+= GET(endasscode
, 1);
2718 rc
= internal_dfa_exec(
2719 md
, /* fixed match data */
2720 asscode
, /* this subexpression's code */
2721 ptr
, /* where we currently are */
2722 (int)(ptr
- start_subject
), /* start offset */
2723 local_offsets
, /* offset vector */
2724 sizeof(local_offsets
)/sizeof(int), /* size of same */
2725 local_workspace
, /* workspace vector */
2726 sizeof(local_workspace
)/sizeof(int), /* size of same */
2727 rlevel
); /* function recursion level */
2729 if (rc
== PCRE_ERROR_DFA_UITEM
) return rc
;
2731 (condcode
== OP_ASSERT
|| condcode
== OP_ASSERTBACK
))
2732 { ADD_ACTIVE((int)(endasscode
+ LINK_SIZE
+ 1 - start_code
), 0); }
2734 { ADD_ACTIVE(state_offset
+ codelink
+ LINK_SIZE
+ 1, 0); }
2739 /*-----------------------------------------------------------------*/
2742 dfa_recursion_info
*ri
;
2743 int local_offsets
[1000];
2744 int local_workspace
[1000];
2745 const pcre_uchar
*callpat
= start_code
+ GET(code
, 1);
2746 int recno
= (callpat
== md
->start_code
)? 0 :
2747 GET2(callpat
, 1 + LINK_SIZE
);
2750 DPRINTF(("%.*sStarting regex recursion\n", rlevel
*2-2, SP
));
2752 /* Check for repeating a recursion without advancing the subject
2753 pointer. This should catch convoluted mutual recursions. (Some simple
2754 cases are caught at compile time.) */
2756 for (ri
= md
->recursive
; ri
!= NULL
; ri
= ri
->prevrec
)
2757 if (recno
== ri
->group_num
&& ptr
== ri
->subject_position
)
2758 return PCRE_ERROR_RECURSELOOP
;
2760 /* Remember this recursion and where we started it so as to
2761 catch infinite loops. */
2763 new_recursive
.group_num
= recno
;
2764 new_recursive
.subject_position
= ptr
;
2765 new_recursive
.prevrec
= md
->recursive
;
2766 md
->recursive
= &new_recursive
;
2768 rc
= internal_dfa_exec(
2769 md
, /* fixed match data */
2770 callpat
, /* this subexpression's code */
2771 ptr
, /* where we currently are */
2772 (int)(ptr
- start_subject
), /* start offset */
2773 local_offsets
, /* offset vector */
2774 sizeof(local_offsets
)/sizeof(int), /* size of same */
2775 local_workspace
, /* workspace vector */
2776 sizeof(local_workspace
)/sizeof(int), /* size of same */
2777 rlevel
); /* function recursion level */
2779 md
->recursive
= new_recursive
.prevrec
; /* Done this recursion */
2781 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel
*2-2, SP
,
2784 /* Ran out of internal offsets */
2786 if (rc
== 0) return PCRE_ERROR_DFA_RECURSE
;
2788 /* For each successful matched substring, set up the next state with a
2789 count of characters to skip before trying it. Note that the count is in
2790 characters, not bytes. */
2794 for (rc
= rc
*2 - 2; rc
>= 0; rc
-= 2)
2796 int charcount
= local_offsets
[rc
+1] - local_offsets
[rc
];
2800 const pcre_uchar
*p
= start_subject
+ local_offsets
[rc
];
2801 const pcre_uchar
*pp
= start_subject
+ local_offsets
[rc
+1];
2802 while (p
< pp
) if (NOT_FIRSTCHAR(*p
++)) charcount
--;
2807 ADD_NEW_DATA(-(state_offset
+ LINK_SIZE
+ 1), 0, (charcount
- 1));
2811 ADD_ACTIVE(state_offset
+ LINK_SIZE
+ 1, 0);
2815 else if (rc
!= PCRE_ERROR_NOMATCH
) return rc
;
2819 /*-----------------------------------------------------------------*/
2826 int charcount
, matched_count
;
2827 const pcre_uchar
*local_ptr
= ptr
;
2830 if (codevalue
== OP_BRAPOSZERO
)
2833 codevalue
= *(++code
); /* Codevalue will be one of above BRAs */
2835 else allow_zero
= FALSE
;
2837 /* Loop to match the subpattern as many times as possible as if it were
2838 a complete pattern. */
2840 for (matched_count
= 0;; matched_count
++)
2842 int local_offsets
[2];
2843 int local_workspace
[1000];
2845 int rc
= internal_dfa_exec(
2846 md
, /* fixed match data */
2847 code
, /* this subexpression's code */
2848 local_ptr
, /* where we currently are */
2849 (int)(ptr
- start_subject
), /* start offset */
2850 local_offsets
, /* offset vector */
2851 sizeof(local_offsets
)/sizeof(int), /* size of same */
2852 local_workspace
, /* workspace vector */
2853 sizeof(local_workspace
)/sizeof(int), /* size of same */
2854 rlevel
); /* function recursion level */
2856 /* Failed to match */
2860 if (rc
!= PCRE_ERROR_NOMATCH
) return rc
;
2864 /* Matched: break the loop if zero characters matched. */
2866 charcount
= local_offsets
[1] - local_offsets
[0];
2867 if (charcount
== 0) break;
2868 local_ptr
+= charcount
; /* Advance temporary position ptr */
/* At this point we have matched the subpattern matched_count
times, and local_ptr is pointing to the character after the end of the
last match. */
2875 if (matched_count
> 0 || allow_zero
)
2877 const pcre_uchar
*end_subpattern
= code
;
2878 int next_state_offset
;
2880 do { end_subpattern
+= GET(end_subpattern
, 1); }
2881 while (*end_subpattern
== OP_ALT
);
2883 (int)(end_subpattern
- start_code
+ LINK_SIZE
+ 1);
2885 /* Optimization: if there are no more active states, and there
2886 are no new states yet set up, then skip over the subject string
2887 right here, to save looping. Otherwise, set up the new state to swing
2888 into action when the end of the matched substring is reached. */
2890 if (i
+ 1 >= active_count
&& new_count
== 0)
2894 ADD_NEW(next_state_offset
, 0);
2898 const pcre_uchar
*p
= ptr
;
2899 const pcre_uchar
*pp
= local_ptr
;
2900 charcount
= (int)(pp
- p
);
2902 if (utf
) while (p
< pp
) if (NOT_FIRSTCHAR(*p
++)) charcount
--;
2904 ADD_NEW_DATA(-next_state_offset
, 0, (charcount
- 1));
2910 /*-----------------------------------------------------------------*/
2914 int local_offsets
[2];
2915 int local_workspace
[1000];
2917 int rc
= internal_dfa_exec(
2918 md
, /* fixed match data */
2919 code
, /* this subexpression's code */
2920 ptr
, /* where we currently are */
2921 (int)(ptr
- start_subject
), /* start offset */
2922 local_offsets
, /* offset vector */
2923 sizeof(local_offsets
)/sizeof(int), /* size of same */
2924 local_workspace
, /* workspace vector */
2925 sizeof(local_workspace
)/sizeof(int), /* size of same */
2926 rlevel
); /* function recursion level */
2930 const pcre_uchar
*end_subpattern
= code
;
2931 int charcount
= local_offsets
[1] - local_offsets
[0];
2932 int next_state_offset
, repeat_state_offset
;
2934 do { end_subpattern
+= GET(end_subpattern
, 1); }
2935 while (*end_subpattern
== OP_ALT
);
2937 (int)(end_subpattern
- start_code
+ LINK_SIZE
+ 1);
2939 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2940 arrange for the repeat state also to be added to the relevant list.
2941 Calculate the offset, or set -1 for no repeat. */
2943 repeat_state_offset
= (*end_subpattern
== OP_KETRMAX
||
2944 *end_subpattern
== OP_KETRMIN
)?
2945 (int)(end_subpattern
- start_code
- GET(end_subpattern
, 1)) : -1;
/* If we have matched an empty string, add the next state at the
current character pointer. This is important so that the duplicate
checking kicks in, which is what breaks infinite loops that match an
empty string. */
2954 ADD_ACTIVE(next_state_offset
, 0);
2957 /* Optimization: if there are no more active states, and there
2958 are no new states yet set up, then skip over the subject string
2959 right here, to save looping. Otherwise, set up the new state to swing
2960 into action when the end of the matched substring is reached. */
2962 else if (i
+ 1 >= active_count
&& new_count
== 0)
2966 ADD_NEW(next_state_offset
, 0);
2968 /* If we are adding a repeat state at the new character position,
2969 we must fudge things so that it is the only current state.
2970 Otherwise, it might be a duplicate of one we processed before, and
2971 that would cause it to be skipped. */
2973 if (repeat_state_offset
>= 0)
2975 next_active_state
= active_states
;
2978 ADD_ACTIVE(repeat_state_offset
, 0);
2986 const pcre_uchar
*p
= start_subject
+ local_offsets
[0];
2987 const pcre_uchar
*pp
= start_subject
+ local_offsets
[1];
2988 while (p
< pp
) if (NOT_FIRSTCHAR(*p
++)) charcount
--;
2991 ADD_NEW_DATA(-next_state_offset
, 0, (charcount
- 1));
2992 if (repeat_state_offset
>= 0)
2993 { ADD_NEW_DATA(-repeat_state_offset
, 0, (charcount
- 1)); }
2996 else if (rc
!= PCRE_ERROR_NOMATCH
) return rc
;
3001 /* ========================================================================== */
3002 /* Handle callouts */
3006 if (PUBL(callout
) != NULL
)
3008 PUBL(callout_block
) cb
;
3009 cb
.version
= 1; /* Version 1 of the callout block */
3010 cb
.callout_number
= code
[1];
3011 cb
.offset_vector
= offsets
;
3012 #ifdef COMPILE_PCRE8
3013 cb
.subject
= (PCRE_SPTR
)start_subject
;
3015 cb
.subject
= (PCRE_SPTR16
)start_subject
;
3017 cb
.subject_length
= (int)(end_subject
- start_subject
);
3018 cb
.start_match
= (int)(current_subject
- start_subject
);
3019 cb
.current_position
= (int)(ptr
- start_subject
);
3020 cb
.pattern_position
= GET(code
, 2);
3021 cb
.next_item_length
= GET(code
, 2 + LINK_SIZE
);
3023 cb
.capture_last
= -1;
3024 cb
.callout_data
= md
->callout_data
;
3025 cb
.mark
= NULL
; /* No (*MARK) support */
3026 if ((rrc
= (*PUBL(callout
))(&cb
)) < 0) return rrc
; /* Abandon */
3029 { ADD_ACTIVE(state_offset
+ PRIV(OP_lengths
)[OP_CALLOUT
], 0); }
3033 /* ========================================================================== */
3034 default: /* Unsupported opcode */
3035 return PCRE_ERROR_DFA_UITEM
;
3038 NEXT_ACTIVE_STATE
: continue;
3040 } /* End of loop scanning active states */
/* We have finished the processing at the current subject character. If no
new states have been set for the next character, we have found all the
matches that we are going to find. If we are at the top level and partial
matching has been requested, check for appropriate conditions.

The "forced_fail" variable counts the number of (*F) encountered for the
character. If it is equal to the original active_count (saved in
workspace[1]) it means that (*F) was found on every active state. In this
case we don't want to give a partial match.

The "could_continue" variable is true if a state could have continued but
for the fact that the end of the subject was reached. */
3057 if (rlevel
== 1 && /* Top level, and */
3058 could_continue
&& /* Some could go on, and */
3059 forced_fail
!= workspace
[1] && /* Not all forced fail & */
3061 (md
->moptions
& PCRE_PARTIAL_HARD
) != 0 /* Hard partial */
3063 ((md
->moptions
& PCRE_PARTIAL_SOFT
) != 0 && /* Soft partial and */
3064 match_count
< 0) /* no matches */
3067 partial_newline
|| /* Either partial NL */
3069 ptr
>= end_subject
&& /* End of subject and */
3070 ptr
> md
->start_used_ptr
) /* Inspected non-empty string */
3074 if (offsetcount
>= 2)
3076 offsets
[0] = (int)(md
->start_used_ptr
- start_subject
);
3077 offsets
[1] = (int)(end_subject
- start_subject
);
3079 match_count
= PCRE_ERROR_PARTIAL
;
3082 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3083 "%.*s---------------------\n\n", rlevel
*2-2, SP
, rlevel
, match_count
,
3085 break; /* In effect, "return", but see the comment below */
3088 /* One or more states are active for the next character. */
3090 ptr
+= clen
; /* Advance to next subject character */
3091 } /* Loop to move along the subject string */
3093 /* Control gets here from "break" a few lines above. We do it this way because
3094 if we use "return" above, we have compiler trouble. Some compilers warn if
3095 there's nothing here because they think the function doesn't return a value. On
3096 the other hand, if we put a dummy statement here, some more clever compilers
3097 complain that it can't be reached. Sigh. */
3105 /*************************************************
3106 * Execute a Regular Expression - DFA engine *
3107 *************************************************/
3109 /* This external function applies a compiled re to a subject string using a DFA
3110 engine. This function calls the internal function multiple times if the pattern
3114 argument_re points to the compiled expression
3115 extra_data points to extra data or is NULL
3116 subject points to the subject string
3117 length length of subject string (may contain binary zeros)
3118 start_offset where to start in the subject string
3120 offsets vector of match offsets
3121 offsetcount size of same
3122 workspace workspace vector
3123 wscount size of same
3125 Returns: > 0 => number of match offset pairs placed in offsets
3126 = 0 => offsets overflowed; longest matches are present
3127 -1 => failed to match
3128 < -1 => some kind of unexpected problem
3131 #ifdef COMPILE_PCRE8
3132 PCRE_EXP_DEFN
int PCRE_CALL_CONVENTION
3133 pcre_dfa_exec(const pcre
*argument_re
, const pcre_extra
*extra_data
,
3134 const char *subject
, int length
, int start_offset
, int options
, int *offsets
,
3135 int offsetcount
, int *workspace
, int wscount
)
3137 PCRE_EXP_DEFN
int PCRE_CALL_CONVENTION
3138 pcre16_dfa_exec(const pcre16
*argument_re
, const pcre16_extra
*extra_data
,
3139 PCRE_SPTR16 subject
, int length
, int start_offset
, int options
, int *offsets
,
3140 int offsetcount
, int *workspace
, int wscount
)
3143 REAL_PCRE
*re
= (REAL_PCRE
*)argument_re
;
3144 dfa_match_data match_block
;
3145 dfa_match_data
*md
= &match_block
;
3146 BOOL utf
, anchored
, startline
, firstline
;
3147 const pcre_uchar
*current_subject
, *end_subject
;
3148 const pcre_study_data
*study
= NULL
;
3150 const pcre_uchar
*req_char_ptr
;
3151 const pcre_uint8
*start_bits
= NULL
;
3152 BOOL has_first_char
= FALSE
;
3153 BOOL has_req_char
= FALSE
;
3154 pcre_uchar first_char
= 0;
3155 pcre_uchar first_char2
= 0;
3156 pcre_uchar req_char
= 0;
3157 pcre_uchar req_char2
= 0;
3160 /* Plausibility checks */
3162 if ((options
& ~PUBLIC_DFA_EXEC_OPTIONS
) != 0) return PCRE_ERROR_BADOPTION
;
3163 if (re
== NULL
|| subject
== NULL
|| workspace
== NULL
||
3164 (offsets
== NULL
&& offsetcount
> 0)) return PCRE_ERROR_NULL
;
3165 if (offsetcount
< 0) return PCRE_ERROR_BADCOUNT
;
3166 if (wscount
< 20) return PCRE_ERROR_DFA_WSSIZE
;
3167 if (start_offset
< 0 || start_offset
> length
) return PCRE_ERROR_BADOFFSET
;
3169 /* Check that the first field in the block is the magic number. If it is not,
3170 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3171 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3172 means that the pattern is likely compiled with different endianness. */
3174 if (re
->magic_number
!= MAGIC_NUMBER
)
3175 return re
->magic_number
== REVERSED_MAGIC_NUMBER
?
3176 PCRE_ERROR_BADENDIANNESS
:PCRE_ERROR_BADMAGIC
;
3177 if ((re
->flags
& PCRE_MODE
) == 0) return PCRE_ERROR_BADMODE
;
3179 /* If restarting after a partial match, do some sanity checks on the contents
3180 of the workspace. */
3182 if ((options
& PCRE_DFA_RESTART
) != 0)
3184 if ((workspace
[0] & (-2)) != 0 || workspace
[1] < 1 ||
3185 workspace
[1] > (wscount
- 2)/INTS_PER_STATEBLOCK
)
3186 return PCRE_ERROR_DFA_BADRESTART
;
3189 /* Set up study, callout, and table data */
3191 md
->tables
= re
->tables
;
3192 md
->callout_data
= NULL
;
3194 if (extra_data
!= NULL
)
3196 unsigned int flags
= extra_data
->flags
;
3197 if ((flags
& PCRE_EXTRA_STUDY_DATA
) != 0)
3198 study
= (const pcre_study_data
*)extra_data
->study_data
;
3199 if ((flags
& PCRE_EXTRA_MATCH_LIMIT
) != 0) return PCRE_ERROR_DFA_UMLIMIT
;
3200 if ((flags
& PCRE_EXTRA_MATCH_LIMIT_RECURSION
) != 0)
3201 return PCRE_ERROR_DFA_UMLIMIT
;
3202 if ((flags
& PCRE_EXTRA_CALLOUT_DATA
) != 0)
3203 md
->callout_data
= extra_data
->callout_data
;
3204 if ((flags
& PCRE_EXTRA_TABLES
) != 0)
3205 md
->tables
= extra_data
->tables
;
3208 /* Set some local values */
3210 current_subject
= (const pcre_uchar
*)subject
+ start_offset
;
3211 end_subject
= (const pcre_uchar
*)subject
+ length
;
3212 req_char_ptr
= current_subject
- 1;
3215 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3216 utf
= (re
->options
& PCRE_UTF8
) != 0;
3221 anchored
= (options
& (PCRE_ANCHORED
|PCRE_DFA_RESTART
)) != 0 ||
3222 (re
->options
& PCRE_ANCHORED
) != 0;
3224 /* The remaining fixed data for passing around. */
3226 md
->start_code
= (const pcre_uchar
*)argument_re
+
3227 re
->name_table_offset
+ re
->name_count
* re
->name_entry_size
;
3228 md
->start_subject
= (const pcre_uchar
*)subject
;
3229 md
->end_subject
= end_subject
;
3230 md
->start_offset
= start_offset
;
3231 md
->moptions
= options
;
3232 md
->poptions
= re
->options
;
3234 /* If the BSR option is not set at match time, copy what was set
3237 if ((md
->moptions
& (PCRE_BSR_ANYCRLF
|PCRE_BSR_UNICODE
)) == 0)
3239 if ((re
->options
& (PCRE_BSR_ANYCRLF
|PCRE_BSR_UNICODE
)) != 0)
3240 md
->moptions
|= re
->options
& (PCRE_BSR_ANYCRLF
|PCRE_BSR_UNICODE
);
3242 else md
->moptions
|= PCRE_BSR_ANYCRLF
;
3246 /* Handle different types of newline. The three bits give eight cases. If
3247 nothing is set at run time, whatever was used at compile time applies. */
3249 switch ((((options
& PCRE_NEWLINE_BITS
) == 0)? re
->options
: (pcre_uint32
)options
) &
3252 case 0: newline
= NEWLINE
; break; /* Compile-time default */
3253 case PCRE_NEWLINE_CR
: newline
= CHAR_CR
; break;
3254 case PCRE_NEWLINE_LF
: newline
= CHAR_NL
; break;
3255 case PCRE_NEWLINE_CR
+
3256 PCRE_NEWLINE_LF
: newline
= (CHAR_CR
<< 8) | CHAR_NL
; break;
3257 case PCRE_NEWLINE_ANY
: newline
= -1; break;
3258 case PCRE_NEWLINE_ANYCRLF
: newline
= -2; break;
3259 default: return PCRE_ERROR_BADNEWLINE
;
3264 md
->nltype
= NLTYPE_ANYCRLF
;
3266 else if (newline
< 0)
3268 md
->nltype
= NLTYPE_ANY
;
3272 md
->nltype
= NLTYPE_FIXED
;
3276 md
->nl
[0] = (newline
>> 8) & 255;
3277 md
->nl
[1] = newline
& 255;
3282 md
->nl
[0] = newline
;
3286 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3287 back the character offset. */
3290 if (utf
&& (options
& PCRE_NO_UTF8_CHECK
) == 0)
3293 int errorcode
= PRIV(valid_utf
)((pcre_uchar
*)subject
, length
, &erroroffset
);
3296 if (offsetcount
>= 2)
3298 offsets
[0] = erroroffset
;
3299 offsets
[1] = errorcode
;
3301 return (errorcode
<= PCRE_UTF8_ERR5
&& (options
& PCRE_PARTIAL_HARD
) != 0)?
3302 PCRE_ERROR_SHORTUTF8
: PCRE_ERROR_BADUTF8
;
3304 if (start_offset
> 0 && start_offset
< length
&&
3305 NOT_FIRSTCHAR(((PCRE_PUCHAR
)subject
)[start_offset
]))
3306 return PCRE_ERROR_BADUTF8_OFFSET
;
3310 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3311 is a feature that makes it possible to save compiled regex and re-use them
3312 in other programs later. */
3314 if (md
->tables
== NULL
) md
->tables
= PRIV(default_tables
);
3316 /* The "must be at the start of a line" flags are used in a loop when finding
3319 startline
= (re
->flags
& PCRE_STARTLINE
) != 0;
3320 firstline
= (re
->options
& PCRE_FIRSTLINE
) != 0;
3322 /* Set up the first character to match, if available. The first_byte value is
3323 never set for an anchored regular expression, but the anchoring may be forced
3324 at run time, so we have to test for anchoring. The first char may be unset for
3325 an unanchored pattern, of course. If there's no first char and the pattern was
3326 studied, there may be a bitmap of possible first characters. */
3330 if ((re
->flags
& PCRE_FIRSTSET
) != 0)
3332 has_first_char
= TRUE
;
3333 first_char
= first_char2
= (pcre_uchar
)(re
->first_char
);
3334 if ((re
->flags
& PCRE_FCH_CASELESS
) != 0)
3336 first_char2
= TABLE_GET(first_char
, md
->tables
+ fcc_offset
, first_char
);
3337 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3338 if (utf
&& first_char
> 127)
3339 first_char2
= UCD_OTHERCASE(first_char
);
3345 if (!startline
&& study
!= NULL
&&
3346 (study
->flags
& PCRE_STUDY_MAPPED
) != 0)
3347 start_bits
= study
->start_bits
;
3351 /* For anchored or unanchored matches, there may be a "last known required
3354 if ((re
->flags
& PCRE_REQCHSET
) != 0)
3356 has_req_char
= TRUE
;
3357 req_char
= req_char2
= (pcre_uchar
)(re
->req_char
);
3358 if ((re
->flags
& PCRE_RCH_CASELESS
) != 0)
3360 req_char2
= TABLE_GET(req_char
, md
->tables
+ fcc_offset
, req_char
);
3361 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3362 if (utf
&& req_char
> 127)
3363 req_char2
= UCD_OTHERCASE(req_char
);
3368 /* Call the main matching function, looping for a non-anchored regex after a
3369 failed match. If not restarting, perform certain optimizations at the start of
3376 if ((options
& PCRE_DFA_RESTART
) == 0)
3378 const pcre_uchar
*save_end_subject
= end_subject
;
3380 /* If firstline is TRUE, the start of the match is constrained to the first
3381 line of a multiline string. Implement this by temporarily adjusting
3382 end_subject so that we stop scanning at a newline. If the match fails at
3383 the newline, later code breaks this loop. */
3387 PCRE_PUCHAR t
= current_subject
;
3391 while (t
< md
->end_subject
&& !IS_NEWLINE(t
))
3394 ACROSSCHAR(t
< end_subject
, *t
, t
++);
3399 while (t
< md
->end_subject
&& !IS_NEWLINE(t
)) t
++;
3403 /* There are some optimizations that avoid running the match if a known
3404 starting point is not found. However, there is an option that disables
3405 these, for testing and for ensuring that all callouts do actually occur.
3406 The option can be set in the regex by (*NO_START_OPT) or passed in
3407 match-time options. */
3409 if (((options
| re
->options
) & PCRE_NO_START_OPTIMIZE
) == 0)
3411 /* Advance to a known first char. */
3415 if (first_char
!= first_char2
)
3416 while (current_subject
< end_subject
&&
3417 *current_subject
!= first_char
&& *current_subject
!= first_char2
)
3420 while (current_subject
< end_subject
&&
3421 *current_subject
!= first_char
)
3425 /* Or to just after a linebreak for a multiline match if possible */
3429 if (current_subject
> md
->start_subject
+ start_offset
)
3434 while (current_subject
< end_subject
&&
3435 !WAS_NEWLINE(current_subject
))
3438 ACROSSCHAR(current_subject
< end_subject
, *current_subject
,
3444 while (current_subject
< end_subject
&& !WAS_NEWLINE(current_subject
))
3447 /* If we have just passed a CR and the newline option is ANY or
3448 ANYCRLF, and we are now at a LF, advance the match position by one
3451 if (current_subject
[-1] == CHAR_CR
&&
3452 (md
->nltype
== NLTYPE_ANY
|| md
->nltype
== NLTYPE_ANYCRLF
) &&
3453 current_subject
< end_subject
&&
3454 *current_subject
== CHAR_NL
)
3459 /* Or to a non-unique first char after study */
3461 else if (start_bits
!= NULL
)
3463 while (current_subject
< end_subject
)
3465 unsigned int c
= *current_subject
;
3466 #ifndef COMPILE_PCRE8
3467 if (c
> 255) c
= 255;
3469 if ((start_bits
[c
/8] & (1 << (c
&7))) == 0)
3472 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3473 /* In non 8-bit mode, the iteration will stop for
3474 characters > 255 at the beginning or not stop at all. */
3476 ACROSSCHAR(current_subject
< end_subject
, *current_subject
,
3485 /* Restore fudged end_subject */
3487 end_subject
= save_end_subject
;
3489 /* The following two optimizations are disabled for partial matching or if
3490 disabling is explicitly requested (and of course, by the test above, this
3491 code is not obeyed when restarting after a partial match). */
3493 if (((options
| re
->options
) & PCRE_NO_START_OPTIMIZE
) == 0 &&
3494 (options
& (PCRE_PARTIAL_HARD
|PCRE_PARTIAL_SOFT
)) == 0)
3496 /* If the pattern was studied, a minimum subject length may be set. This
3497 is a lower bound; no actual string of that length may actually match the
3498 pattern. Although the value is, strictly, in characters, we treat it as
3499 bytes to avoid spending too much time in this optimization. */
3501 if (study
!= NULL
&& (study
->flags
& PCRE_STUDY_MINLEN
) != 0 &&
3502 (pcre_uint32
)(end_subject
- current_subject
) < study
->minlength
)
3503 return PCRE_ERROR_NOMATCH
;
3505 /* If req_char is set, we know that that character must appear in the
3506 subject for the match to succeed. If the first character is set, req_char
3507 must be later in the subject; otherwise the test starts at the match
3508 point. This optimization can save a huge amount of work in patterns with
3509 nested unlimited repeats that aren't going to match. Writing separate
3510 code for cased/caseless versions makes it go faster, as does using an
3511 autoincrement and backing off on a match.
3513 HOWEVER: when the subject string is very, very long, searching to its end
3514 can take a long time, and give bad performance on quite ordinary
3515 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3516 string... so we don't do this when the string is sufficiently long. */
3518 if (has_req_char
&& end_subject
- current_subject
< REQ_BYTE_MAX
)
3520 PCRE_PUCHAR p
= current_subject
+ (has_first_char
? 1:0);
3522 /* We don't need to repeat the search if we haven't yet reached the
3523 place we found it at last time. */
3525 if (p
> req_char_ptr
)
3527 if (req_char
!= req_char2
)
3529 while (p
< end_subject
)
3532 if (pp
== req_char
|| pp
== req_char2
) { p
--; break; }
3537 while (p
< end_subject
)
3539 if (*p
++ == req_char
) { p
--; break; }
3543 /* If we can't find the required character, break the matching loop,
3544 which will cause a return or PCRE_ERROR_NOMATCH. */
3546 if (p
>= end_subject
) break;
3548 /* If we have found the required character, save the point where we
3549 found it, so that we don't search again next time round the loop if
3550 the start hasn't passed this character yet. */
3556 } /* End of optimizations that are done when not restarting */
3558 /* OK, now we can do the business */
3560 md
->start_used_ptr
= current_subject
;
3561 md
->recursive
= NULL
;
3563 rc
= internal_dfa_exec(
3564 md
, /* fixed match data */
3565 md
->start_code
, /* this subexpression's code */
3566 current_subject
, /* where we currently are */
3567 start_offset
, /* start offset in subject */
3568 offsets
, /* offset vector */
3569 offsetcount
, /* size of same */
3570 workspace
, /* workspace vector */
3571 wscount
, /* size of same */
3572 0); /* function recurse level */
3574 /* Anything other than "no match" means we are done, always; otherwise, carry
3575 on only if not anchored. */
3577 if (rc
!= PCRE_ERROR_NOMATCH
|| anchored
) return rc
;
3579 /* Advance to the next subject character unless we are at the end of a line
3580 and firstline is set. */
3582 if (firstline
&& IS_NEWLINE(current_subject
)) break;
3587 ACROSSCHAR(current_subject
< end_subject
, *current_subject
,
3591 if (current_subject
> end_subject
) break;
3593 /* If we have just passed a CR and we are now at a LF, and the pattern does
3594 not contain any explicit matches for \r or \n, and the newline option is CRLF
3595 or ANY or ANYCRLF, advance the match position by one more character. */
3597 if (current_subject
[-1] == CHAR_CR
&&
3598 current_subject
< end_subject
&&
3599 *current_subject
== CHAR_NL
&&
3600 (re
->flags
& PCRE_HASCRORLF
) == 0 &&
3601 (md
->nltype
== NLTYPE_ANY
||
3602 md
->nltype
== NLTYPE_ANYCRLF
||
3606 } /* "Bumpalong" loop */
3608 return PCRE_ERROR_NOMATCH
;
/* End of pcre_dfa_exec.c */