Reduce signed comparison warnings.
[sljit.git] / regex_src / regexJIT.c
blob3f55b3408214fb996436f957a4d3e3b2174187f4
1 /*
2 * Stack-less Just-In-Time compiler
4 * Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
6 * Redistribution and use in source and binary forms, with or without modification, are
7 * permitted provided that the following conditions are met:
9 * 1. Redistributions of source code must retain the above copyright notice, this list of
10 * conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice, this list
13 * of conditions and the following disclaimer in the documentation and/or other materials
14 * provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "sljitLir.h"
28 #include "regexJIT.h"
30 #include <stdlib.h>
32 #ifdef REGEX_MATCH_VERBOSE
33 #include <stdio.h>
34 #endif
36 /* Extra, hidden flags:
37 {id!} where id > 0 found in the code. */
38 #define REGEX_ID_CHECK 0x100
39 /* When REGEX_NEWLINE && REGEX_MATCH_BEGIN defined, the pattern turn to a normal search,
40 which starts with [\r\n] character range. */
41 #define REGEX_FAKE_MATCH_BEGIN 0x200
42 /* When REGEX_NEWLINE && REGEX_MATCH_END defined, the pattern turn to a normal search,
43 which ends with [\r\n] character range. */
44 #define REGEX_FAKE_MATCH_END 0x400
46 /* --------------------------------------------------------------------- */
47 /* Structures for JIT-ed pattern matching */
48 /* --------------------------------------------------------------------- */
50 struct regex_machine
52 /* flags. */
53 int flags;
54 /* Number of state descriptors for one term. */
55 sljit_sw no_states;
56 /* Total size. */
57 sljit_sw size;
59 union {
60 void *init_match;
61 sljit_sw (SLJIT_FUNC *call_init)(void *next, void* match);
62 } u;
63 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
64 struct sljit_function_context context;
65 #endif
67 void *continue_match;
69 /* Variable sized array to contain the handler addresses. */
70 sljit_uw entry_addrs[1];
73 struct regex_match
75 /* Current and next state array. */
76 sljit_sw *current;
77 sljit_sw *next;
78 /* Starting. */
79 sljit_sw head;
80 /* String character index (ever increasing). */
81 sljit_sw index;
82 /* Best match found so far (members in priority order). */
83 sljit_sw best_begin;
84 sljit_sw best_end;
85 sljit_sw best_id;
86 /* Bool flags (encoded as word). */
87 sljit_sw fast_quit;
88 sljit_sw fast_forward;
89 /* Machine. */
90 struct regex_machine *machine;
92 union {
93 void *continue_match;
94 void (SLJIT_FUNC *call_continue)(struct regex_match *match, const regex_char_t *input_string, int length);
95 } u;
97 /* Variable sized array to contain the state arrays. */
98 sljit_sw states[1];
101 /* State vector
102 ITEM[0] - pointer to the address inside the machine code
103 ITEM[1] - next pointer
104 ITEM[2] - string started from (optional)
105 ITEM[3] - max ID (optional) */
107 /* Register allocation. */
108 /* Current state array (loaded & stored: regex_match->current). */
109 #define R_CURR_STATE SLJIT_S0
110 /* Next state array (loaded & stored: regex_match->next). */
111 #define R_NEXT_STATE SLJIT_S1
112 /* Head (loaded & stored: regex_match->head). */
113 #define R_NEXT_HEAD SLJIT_S2
114 /* String fragment pointer. */
115 #define R_STRING SLJIT_S3
116 /* String fragment length. */
117 #define R_LENGTH SLJIT_S4
118 /* 'struct regex_match*' */
119 #define R_REGEX_MATCH SLJIT_R0
120 /* Current character. */
121 #define R_CURR_CHAR SLJIT_R1
122 /* Temporary register. */
123 #define R_TEMP SLJIT_R2
124 /* Caches the regex_match->best_begin. */
125 #define R_BEST_BEGIN SLJIT_R3
126 /* Current character index. */
127 #define R_CURR_INDEX SLJIT_R4
129 /* --------------------------------------------------------------------- */
130 /* Stack management */
131 /* --------------------------------------------------------------------- */
133 /* Try to allocate 2^n blocks. */
134 #define STACK_FRAGMENT_SIZE (((64 * sizeof(struct stack_item)) - (sizeof(struct stack_fragment_data))) / (sizeof(struct stack_item)))
136 struct stack_item {
137 int type;
138 int value;
141 struct stack_fragment_data {
142 struct stack_fragment *next;
143 struct stack_fragment *prev;
146 struct stack_fragment {
147 struct stack_fragment_data data;
148 struct stack_item items[STACK_FRAGMENT_SIZE];
151 struct stack {
152 struct stack_fragment *first;
153 struct stack_fragment *last;
154 sljit_uw index;
155 sljit_uw count;
158 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
160 static void stack_check(struct stack *stack)
162 struct stack_fragment *curr;
163 int found;
165 if (!stack)
166 return;
168 SLJIT_ASSERT(stack->index >= 0 && stack->index < STACK_FRAGMENT_SIZE);
170 if (stack->first == NULL) {
171 SLJIT_ASSERT(stack->first == NULL && stack->last == NULL);
172 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
173 return;
176 found = 0;
177 if (stack->last == NULL) {
178 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
179 found = 1;
181 else
182 SLJIT_ASSERT(stack->index >= 0 && stack->count >= 0);
184 SLJIT_ASSERT(stack->first->data.prev == NULL);
185 curr = stack->first;
186 while (curr) {
187 if (curr == stack->last)
188 found = 1;
189 if (curr->data.next)
190 SLJIT_ASSERT(curr->data.next->data.prev == curr);
191 curr = curr->data.next;
193 SLJIT_ASSERT(found);
196 #endif
198 static void stack_init(struct stack *stack)
200 stack->first = NULL;
201 stack->last = NULL;
202 stack->index = STACK_FRAGMENT_SIZE - 1;
203 stack->count = 0;
206 static void stack_destroy(struct stack *stack)
208 struct stack_fragment *curr = stack->first;
209 struct stack_fragment *prev;
211 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
212 stack_check(stack);
213 #endif
215 while (curr) {
216 prev = curr;
217 curr = curr->data.next;
218 SLJIT_FREE(prev, NULL);
222 static SLJIT_INLINE struct stack_item* stack_top(struct stack *stack)
224 SLJIT_ASSERT(stack->last);
225 return stack->last->items + stack->index;
228 static int stack_push(struct stack *stack, int type, int value)
230 if (stack->last) {
231 stack->index++;
232 if (stack->index >= STACK_FRAGMENT_SIZE) {
233 stack->index = 0;
234 if (!stack->last->data.next) {
235 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
236 if (!stack->last->data.next)
237 return 1;
238 stack->last->data.next->data.next = NULL;
239 stack->last->data.next->data.prev = stack->last;
241 stack->last = stack->last->data.next;
244 else if (!stack->first) {
245 stack->last = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
246 if (!stack->last)
247 return 1;
248 stack->last->data.prev = NULL;
249 stack->last->data.next = NULL;
250 stack->first = stack->last;
251 stack->index = 0;
253 else {
254 stack->last = stack->first;
255 stack->index = 0;
257 stack->last->items[stack->index].type = type;
258 stack->last->items[stack->index].value = value;
259 stack->count++;
260 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
261 stack_check(stack);
262 #endif
263 return 0;
266 static struct stack_item* stack_pop(struct stack *stack)
268 struct stack_item *ret = stack_top(stack);
270 if (stack->index > 0)
271 stack->index--;
272 else {
273 stack->last = stack->last->data.prev;
274 stack->index = STACK_FRAGMENT_SIZE - 1;
277 stack->count--;
278 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
279 stack_check(stack);
280 #endif
281 return ret;
284 static SLJIT_INLINE void stack_clone(struct stack *src, struct stack *dst)
286 *dst = *src;
289 static int stack_push_copy(struct stack *stack, int items, int length)
291 struct stack_fragment *frag1;
292 struct stack_fragment *frag2;
293 sljit_uw ind1, ind2;
294 sljit_uw counter;
296 SLJIT_ASSERT(stack->count >= (sljit_uw)length && items <= length && items > 0);
298 /* Allocate the necessary elements. */
299 counter = (sljit_uw)items;
300 frag1 = stack->last;
301 ind1 = stack->index;
302 while (counter > 0) {
303 if (stack->index + counter >= STACK_FRAGMENT_SIZE) {
304 SLJIT_ASSERT(counter >= STACK_FRAGMENT_SIZE - stack->index - 1 + 1);
305 counter -= STACK_FRAGMENT_SIZE - stack->index - 1 + 1;
306 stack->index = 0;
307 if (!stack->last->data.next) {
308 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
309 if (!stack->last->data.next)
310 return 1;
311 stack->last->data.next->data.next = NULL;
312 stack->last->data.next->data.prev = stack->last;
314 stack->last = stack->last->data.next;
316 else {
317 stack->index += counter;
318 counter = 0;
322 frag2 = stack->last;
323 ind2 = stack->index;
324 while (length > 0) {
325 frag2->items[ind2] = frag1->items[ind1];
327 if (ind1 == 0) {
328 ind1 = STACK_FRAGMENT_SIZE;
329 frag1 = frag1->data.prev;
331 if (ind2 == 0) {
332 ind2 = STACK_FRAGMENT_SIZE;
333 frag2 = frag2->data.prev;
336 ind1--;
337 ind2--;
338 length--;
341 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
342 stack_check(stack);
343 #endif
344 stack->count += (sljit_uw)items;
345 return 0;
348 /* --------------------------------------------------------------------- */
349 /* Parser */
350 /* --------------------------------------------------------------------- */
352 enum {
353 /* Common. */
354 type_begin,
355 type_end,
356 type_char,
357 type_newline,
358 type_id,
359 type_rng_start,
360 type_rng_end,
361 type_rng_char,
362 type_rng_left,
363 type_rng_right,
365 /* generator only. */
366 type_branch,
367 type_jump,
369 /* Parser only. */
370 type_open_br,
371 type_close_br,
372 type_select,
373 type_asterisk,
374 type_plus_sign,
375 type_qestion_mark
378 struct compiler_common {
379 /* Temporary stacks. */
380 struct stack stack;
381 struct stack depth;
382 /* REGEX_ flags. */
383 int flags;
384 /* Encoded size of the dfa representation. */
385 sljit_uw dfa_size;
386 /* Number of terms. */
387 sljit_sw terms_size;
388 /* Number of state descriptors for one term (same as machine->no_states). */
389 sljit_sw no_states;
390 /* Number of type_rng_(char|left)-s in the longest character range. */
391 sljit_sw longest_range_size;
393 /* DFA linear representation (size: dfa_size). */
394 struct stack_item *dfa_transitions;
395 /* Term id and search state pairs (size: dfa_size). */
396 struct stack_item *search_states;
398 /* sljit compiler */
399 struct sljit_compiler *compiler;
400 /* Machine data, which must be kept for later use. */
401 struct regex_machine *machine;
402 /* Temporary space for jumps (size: longest_range_size). */
403 struct sljit_jump **range_jump_list;
406 static const regex_char_t* decode_number(const regex_char_t *regex_string, int length, int *result)
408 int value = 0;
410 SLJIT_ASSERT(length > 0);
411 if (*regex_string < '0' || *regex_string > '9') {
412 *result = -1;
413 return regex_string;
416 while (length > 0 && *regex_string >= '0' && *regex_string <= '9') {
417 value = value * 10 + (*regex_string - '0');
418 length--;
419 regex_string++;
422 *result = value;
423 return regex_string;
426 static int iterate(struct stack *stack, int min, int max)
428 struct stack it;
429 struct stack_item *item;
430 int count = -1;
431 int len = 0;
432 int depth = 0;
434 stack_clone(stack, &it);
436 /* Calculate size. */
437 while (count < 0) {
438 item = stack_pop(&it);
439 switch (item->type) {
440 case type_id:
441 case type_rng_end:
442 case type_rng_char:
443 case type_rng_left:
444 case type_rng_right:
445 case type_plus_sign:
446 case type_qestion_mark:
447 len++;
448 break;
450 case type_asterisk:
451 len += 2;
452 break;
454 case type_close_br:
455 depth++;
456 break;
458 case type_open_br:
459 SLJIT_ASSERT(depth > 0);
460 depth--;
461 if (depth == 0)
462 count = (int)it.count;
463 break;
465 case type_select:
466 SLJIT_ASSERT(depth > 0);
467 len += 2;
468 break;
470 default:
471 SLJIT_ASSERT(item->type != type_begin && item->type != type_end);
472 if (depth == 0)
473 count = (int)it.count;
474 len++;
475 break;
479 if (min == 0 && max == 0) {
480 /* {0,0} case, not {0,} case: delete subtree. */
481 stack_clone(&it, stack);
482 /* and put an empty bracket expression instead of it. */
483 if (stack_push(stack, type_open_br, 0))
484 return REGEX_MEMORY_ERROR;
485 if (stack_push(stack, type_close_br, 0))
486 return REGEX_MEMORY_ERROR;
487 return len;
490 count = (int)stack->count - count;
492 /* Put an open bracket before the sequence. */
493 if (stack_push_copy(stack, 1, count))
494 return -1;
496 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
497 SLJIT_ASSERT(stack_push(&it, type_open_br, 0) == 0);
498 #else
499 stack_push(&it, type_open_br, 0);
500 #endif
502 /* Copy the data. */
503 if (max > 0) {
504 len = len * (max - 1);
505 max -= min;
506 /* Insert ? operators. */
507 len += max;
509 if (min > 0) {
510 min--;
511 while (min > 0) {
512 if (stack_push_copy(stack, count, count))
513 return -1;
514 min--;
516 if (max > 0) {
517 if (stack_push_copy(stack, count, count))
518 return -1;
519 if (stack_push(stack, type_qestion_mark, 0))
520 return REGEX_MEMORY_ERROR;
521 count++;
522 max--;
525 else {
526 SLJIT_ASSERT(max > 0);
527 max--;
528 count++;
529 if (stack_push(stack, type_qestion_mark, 0))
530 return REGEX_MEMORY_ERROR;
533 while (max > 0) {
534 if (stack_push_copy(stack, count, count))
535 return -1;
536 max--;
539 else {
540 SLJIT_ASSERT(min > 0);
541 min--;
542 /* Insert + operator. */
543 len = len * min + 1;
544 while (min > 0) {
545 if (stack_push_copy(stack, count, count))
546 return -1;
547 min--;
550 if (stack_push(stack, type_plus_sign, 0))
551 return REGEX_MEMORY_ERROR;
554 /* Close the opened bracket. */
555 if (stack_push(stack, type_close_br, 0))
556 return REGEX_MEMORY_ERROR;
558 return len;
561 static int parse_iterator(const regex_char_t *regex_string, int length, struct stack *stack, sljit_uw *dfa_size, int begin)
563 /* We only know that *regex_string == { . */
564 int val1, val2;
565 const regex_char_t *base_from = regex_string;
566 const regex_char_t *from;
568 length--;
569 regex_string++;
571 /* Decode left value. */
572 val2 = -1;
573 if (length == 0)
574 return -2;
575 if (*regex_string == ',') {
576 val1 = 0;
577 length--;
578 regex_string++;
580 else {
581 from = regex_string;
582 regex_string = decode_number(regex_string, length, &val1);
583 if (val1 < 0)
584 return -2;
585 length -= (int)(regex_string - from);
587 if (length == 0)
588 return -2;
589 if (*regex_string == '}') {
590 val2 = val1;
591 if (val1 == 0)
592 val1 = -1;
594 else if (length >= 2 && *regex_string == '!' && regex_string[1] == '}') {
595 /* Non posix extension. */
596 if (stack_push(stack, type_id, val1))
597 return -1;
598 (*dfa_size)++;
599 return (int)(regex_string - base_from) + 1;
601 else {
602 if (*regex_string != ',')
603 return -2;
604 length--;
605 regex_string++;
609 if (begin)
610 return -2;
612 /* Decode right value. */
613 if (val2 == -1) {
614 if (length == 0)
615 return -2;
616 if (*regex_string == '}')
617 val2 = 0;
618 else {
619 from = regex_string;
620 regex_string = decode_number(regex_string, length, &val2);
621 length -= (int)(regex_string - from);
622 if (val2 < 0 || length == 0 || *regex_string != '}' || val2 < val1)
623 return -2;
624 if (val2 == 0) {
625 SLJIT_ASSERT(val1 == 0);
626 val1 = -1;
631 /* Fast cases. */
632 if (val1 > 1 || val2 > 1) {
633 val1 = iterate(stack, val1, val2);
634 if (val1 < 0)
635 return -1;
636 *dfa_size += (sljit_uw)val1;
638 else if (val1 == 0 && val2 == 0) {
639 if (stack_push(stack, type_asterisk, 0))
640 return -1;
641 *dfa_size += 2;
643 else if (val1 == 1 && val2 == 0) {
644 if (stack_push(stack, type_plus_sign, 0))
645 return -1;
646 (*dfa_size)++;
648 else if (val1 == 0 && val2 == 1) {
649 if (stack_push(stack, type_qestion_mark, 0))
650 return -1;
651 (*dfa_size)++;
653 else if (val1 == -1) {
654 val1 = iterate(stack, 0, 0);
655 if (val1 < 0)
656 return -1;
657 *dfa_size -= (sljit_uw)val1;
658 SLJIT_ASSERT(*dfa_size >= 2);
660 else {
661 /* Ignore. */
662 SLJIT_ASSERT(val1 == 1 && val2 == 1);
664 return (int)(regex_string - base_from);
667 static int parse_char_range(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
669 struct stack* stack = &compiler_common->stack;
670 const regex_char_t *base_from = regex_string;
671 regex_char_t left_char, right_char, tmp_char;
673 length--;
674 regex_string++;
676 if (length == 0)
677 return -2;
679 if (*regex_string != '^') {
680 if (stack_push(stack, type_rng_start, 0))
681 return -1;
683 else {
684 length--;
685 regex_string++;
687 if (length == 0)
688 return -2;
690 if (stack_push(stack, type_rng_start, 1))
691 return -1;
693 /* For both the type_rng_start & type_rng_end. */
694 compiler_common->dfa_size += 2;
696 /* Range must be at least 1 character. */
697 if (*regex_string == ']') {
698 length--;
699 regex_string++;
700 if (stack_push(stack, type_rng_char, ']'))
701 return -1;
702 compiler_common->dfa_size++;
705 while (1) {
706 if (length == 0)
707 return -2;
709 if (*regex_string == ']')
710 break;
712 if (*regex_string != '\\')
713 left_char = *regex_string;
714 else {
715 regex_string++;
716 length--;
717 if (length == 0)
718 return -2;
719 left_char = *regex_string;
721 regex_string++;
722 length--;
724 /* Is a range here? */
725 if (length >= 3 && *regex_string == '-' && *(regex_string + 1) != ']') {
726 regex_string++;
727 length--;
729 if (*regex_string != '\\')
730 right_char = *regex_string;
731 else {
732 regex_string++;
733 length--;
734 if (length == 0)
735 return -2;
736 right_char = *regex_string;
738 regex_string++;
739 length--;
741 if (left_char > right_char) {
742 /* Swap if necessary. */
743 tmp_char = left_char;
744 left_char = right_char;
745 right_char = tmp_char;
748 if (stack_push(stack, type_rng_left, left_char))
749 return -1;
750 if (stack_push(stack, type_rng_right, right_char))
751 return -1;
752 compiler_common->dfa_size += 2;
754 else {
755 if (stack_push(stack, type_rng_char, left_char))
756 return -1;
757 compiler_common->dfa_size++;
761 if (stack_push(stack, type_rng_end, 0))
762 return -1;
763 return (int)(regex_string - base_from);
766 static int parse(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
768 /* Depth of bracketed expressions. */
769 int depth = 0;
770 /* Have we already found a term? '1' if not yet. */
771 int begin = 1;
772 /* Cache stack pointer. */
773 struct stack* stack = &compiler_common->stack;
774 int tmp;
776 /* Type_begin and type_end. */
777 compiler_common->dfa_size = 2;
778 stack_init(stack);
779 if (stack_push(stack, type_begin, 0))
780 return REGEX_MEMORY_ERROR;
782 if (length > 0 && *regex_string == '^') {
783 compiler_common->flags |= REGEX_MATCH_BEGIN;
784 length--;
785 regex_string++;
788 if ((compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) == (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) {
789 /* Replace REGEX_MATCH_BEGIN flag to REGEX_FAKE_MATCH_BEGIN */
790 compiler_common->flags &= ~REGEX_MATCH_BEGIN;
791 compiler_common->flags |= REGEX_FAKE_MATCH_BEGIN;
792 /* and append a new-line search. */
793 if (stack_push(stack, type_newline, 0))
794 return REGEX_MEMORY_ERROR;
795 compiler_common->dfa_size++;
796 /* Begin intentionally kept as 1. */
799 while (length > 0) {
800 switch (*regex_string) {
801 case '\\' :
802 length--;
803 regex_string++;
804 if (length == 0)
805 return REGEX_INVALID_REGEX;
806 if (stack_push(stack, type_char, *regex_string))
807 return REGEX_MEMORY_ERROR;
808 begin = 0;
809 compiler_common->dfa_size++;
810 break;
812 case '.' :
813 if (stack_push(stack, type_rng_start, 1))
814 return REGEX_MEMORY_ERROR;
815 if (compiler_common->flags & REGEX_NEWLINE) {
816 if (stack_push(stack, type_rng_char, '\n'))
817 return REGEX_MEMORY_ERROR;
818 if (stack_push(stack, type_rng_char, '\r'))
819 return REGEX_MEMORY_ERROR;
820 compiler_common->dfa_size += 2;
822 if (stack_push(stack, type_rng_end, 1))
823 return REGEX_MEMORY_ERROR;
824 begin = 0;
825 compiler_common->dfa_size += 2;
826 break;
828 case '(' :
829 depth++;
830 if (stack_push(stack, type_open_br, 0))
831 return REGEX_MEMORY_ERROR;
832 begin = 1;
833 break;
835 case ')' :
836 if (depth == 0)
837 return REGEX_INVALID_REGEX;
838 depth--;
839 if (stack_push(stack, type_close_br, 0))
840 return REGEX_MEMORY_ERROR;
841 begin = 0;
842 break;
844 case '|' :
845 if (stack_push(stack, type_select, 0))
846 return REGEX_MEMORY_ERROR;
847 begin = 1;
848 compiler_common->dfa_size += 2;
849 break;
851 case '*' :
852 if (begin)
853 return REGEX_INVALID_REGEX;
854 if (stack_push(stack, type_asterisk, 0))
855 return REGEX_MEMORY_ERROR;
856 compiler_common->dfa_size += 2;
857 break;
859 case '?' :
860 case '+' :
861 if (begin)
862 return REGEX_INVALID_REGEX;
863 if (stack_push(stack, (*regex_string == '+') ? type_plus_sign : type_qestion_mark, 0))
864 return REGEX_MEMORY_ERROR;
865 compiler_common->dfa_size++;
866 break;
868 case '{' :
869 tmp = parse_iterator(regex_string, length, stack, &compiler_common->dfa_size, begin);
871 if (tmp >= 0) {
872 length -= tmp;
873 regex_string += tmp;
875 else if (tmp == -1)
876 return REGEX_MEMORY_ERROR;
877 else {
878 /* Not a valid range expression. */
879 SLJIT_ASSERT(tmp == -2);
880 if (stack_push(stack, type_char, '{'))
881 return REGEX_MEMORY_ERROR;
882 compiler_common->dfa_size++;
884 break;
886 case '[' :
887 tmp = parse_char_range(regex_string, length, compiler_common);
888 if (tmp >= 0) {
889 length -= tmp;
890 regex_string += tmp;
892 else if (tmp == -1)
893 return REGEX_MEMORY_ERROR;
894 else {
895 SLJIT_ASSERT(tmp == -2);
896 return REGEX_INVALID_REGEX;
898 begin = 0;
899 break;
901 default:
902 if (length == 1 && *regex_string == '$') {
903 compiler_common->flags |= REGEX_MATCH_END;
904 break;
906 if (stack_push(stack, type_char, *regex_string))
907 return REGEX_MEMORY_ERROR;
908 begin = 0;
909 compiler_common->dfa_size++;
910 break;
912 length--;
913 regex_string++;
916 if (depth != 0)
917 return REGEX_INVALID_REGEX;
919 if ((compiler_common->flags & (REGEX_MATCH_END | REGEX_NEWLINE)) == (REGEX_MATCH_END | REGEX_NEWLINE)) {
920 /* Replace REGEX_MATCH_END flag to REGEX_FAKE_MATCH_END */
921 compiler_common->flags &= ~REGEX_MATCH_END;
922 compiler_common->flags |= REGEX_FAKE_MATCH_END;
923 /* and append a new-line search. */
924 if (stack_push(stack, type_newline, 1))
925 return REGEX_MEMORY_ERROR;
926 compiler_common->dfa_size++;
927 /* Begin intentionally kept as 1. */
930 if (stack_push(stack, type_end, 0))
931 return REGEX_MEMORY_ERROR;
933 return REGEX_NO_ERROR;
936 /* --------------------------------------------------------------------- */
937 /* Generating machine state transitions */
938 /* --------------------------------------------------------------------- */
940 #define PUT_TRANSITION(typ, val) \
941 do { \
942 --transitions_ptr; \
943 transitions_ptr->type = typ; \
944 transitions_ptr->value = val; \
945 } while (0)
947 static struct stack_item* handle_iteratives(struct stack_item *transitions_ptr, struct stack_item *transitions, struct stack *depth)
949 struct stack_item *item;
951 while (1) {
952 item = stack_top(depth);
954 switch (item->type) {
955 case type_asterisk:
956 SLJIT_ASSERT(transitions[item->value].type == type_branch);
957 transitions[item->value].value = (int)(transitions_ptr - transitions);
958 PUT_TRANSITION(type_branch, item->value + 1);
959 break;
961 case type_plus_sign:
962 SLJIT_ASSERT(transitions[item->value].type == type_branch);
963 transitions[item->value].value = (int)(transitions_ptr - transitions);
964 break;
966 case type_qestion_mark:
967 PUT_TRANSITION(type_branch, item->value);
968 break;
970 default:
971 return transitions_ptr;
973 stack_pop(depth);
977 static int generate_transitions(struct compiler_common *compiler_common)
979 struct stack *stack = &compiler_common->stack;
980 struct stack *depth = &compiler_common->depth;
981 struct stack_item *transitions_ptr;
982 struct stack_item *item;
984 stack_init(depth);
985 compiler_common->dfa_transitions = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
986 if (!compiler_common->dfa_transitions)
987 return REGEX_MEMORY_ERROR;
989 /* Go through the items of the stack and generate the necessary branches and jumps (edges of DFA). */
990 transitions_ptr = compiler_common->dfa_transitions + compiler_common->dfa_size;
991 while (stack->count > 0) {
992 item = stack_pop(stack);
993 switch (item->type) {
994 case type_begin:
995 case type_open_br:
996 item = stack_pop(depth);
997 if (item->type == type_select)
998 PUT_TRANSITION(type_branch, item->value + 1);
999 else
1000 SLJIT_ASSERT(item->type == type_close_br);
1001 if (stack->count == 0)
1002 PUT_TRANSITION(type_begin, 0);
1003 else
1004 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1005 break;
1007 case type_end:
1008 case type_close_br:
1009 if (item->type == type_end)
1010 *--transitions_ptr = *item;
1011 if (stack_push(depth, type_close_br, (int)(transitions_ptr - compiler_common->dfa_transitions)))
1012 return REGEX_MEMORY_ERROR;
1013 break;
1015 case type_select:
1016 item = stack_top(depth);
1017 if (item->type == type_select) {
1018 SLJIT_ASSERT(compiler_common->dfa_transitions[item->value].type == type_jump);
1019 PUT_TRANSITION(type_branch, item->value + 1);
1020 PUT_TRANSITION(type_jump, item->value);
1021 item->value = (int)(transitions_ptr - compiler_common->dfa_transitions);
1023 else {
1024 SLJIT_ASSERT(item->type == type_close_br);
1025 item->type = type_select;
1026 PUT_TRANSITION(type_jump, item->value);
1027 item->value = (int)(transitions_ptr - compiler_common->dfa_transitions);
1029 break;
1031 case type_asterisk:
1032 case type_plus_sign:
1033 case type_qestion_mark:
1034 if (item->type != type_qestion_mark)
1035 PUT_TRANSITION(type_branch, 0);
1036 if (stack_push(depth, item->type, (int)(transitions_ptr - compiler_common->dfa_transitions)))
1037 return REGEX_MEMORY_ERROR;
1038 break;
1040 case type_char:
1041 case type_newline:
1042 case type_rng_start:
1043 /* Requires handle_iteratives. */
1044 *--transitions_ptr = *item;
1045 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1046 break;
1048 default:
1049 *--transitions_ptr = *item;
1050 break;
1054 SLJIT_ASSERT(compiler_common->dfa_transitions == transitions_ptr);
1055 SLJIT_ASSERT(depth->count == 0);
1056 return REGEX_NO_ERROR;
1059 #undef PUT_TRANSITION
1061 #ifdef REGEX_MATCH_VERBOSE
1063 static void verbose_transitions(struct compiler_common *compiler_common)
1065 struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1066 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1067 struct stack_item *search_states_ptr = compiler_common->search_states;
1068 int pos;
1070 printf("-----------------\nTransitions\n-----------------\n");
1071 pos = 0;
1072 while (transitions_ptr < transitions_end) {
1073 printf("[%3d] ", pos++);
1074 if (search_states_ptr->type >= 0)
1075 printf("(%3d) ", search_states_ptr->type);
1076 switch (transitions_ptr->type) {
1077 case type_begin:
1078 printf("type_begin\n");
1079 break;
1081 case type_end:
1082 printf("type_end\n");
1083 break;
1085 case type_char:
1086 if (transitions_ptr->value >= ' ')
1087 printf("type_char '%c'\n", transitions_ptr->value);
1088 else
1089 printf("type_char 0x%x\n", transitions_ptr->value);
1090 break;
1092 case type_newline:
1093 printf("type_newline %s\n", transitions_ptr->value ? "(end)" : "(begin)");
1094 break;
1096 case type_id:
1097 printf("type_id %d\n", transitions_ptr->value);
1098 break;
1100 case type_rng_start:
1101 printf("type_rng_start %s\n", transitions_ptr->value ? "(invert)" : "(normal)");
1102 break;
1104 case type_rng_end:
1105 printf("type_rng_end\n");
1106 break;
1108 case type_rng_char:
1109 if (transitions_ptr->value >= ' ')
1110 printf("type_rng_char '%c'\n", transitions_ptr->value);
1111 else
1112 printf("type_rng_char 0x%x\n", transitions_ptr->value);
1113 break;
1115 case type_rng_left:
1116 if (transitions_ptr->value >= ' ')
1117 printf("type_rng_left '%c'\n", transitions_ptr->value);
1118 else
1119 printf("type_rng_left 0x%x\n", transitions_ptr->value);
1120 break;
1122 case type_rng_right:
1123 if (transitions_ptr->value >= ' ')
1124 printf("type_rng_right '%c'\n", transitions_ptr->value);
1125 else
1126 printf("type_rng_right 0x%x\n", transitions_ptr->value);
1127 break;
1129 case type_branch:
1130 printf("type_branch -> %d\n", transitions_ptr->value);
1131 break;
1133 case type_jump:
1134 printf("type_jump -> %d\n", transitions_ptr->value);
1135 break;
1137 default:
1138 printf("UNEXPECTED TYPE\n");
1139 break;
1141 transitions_ptr++;
1142 search_states_ptr++;
1144 printf("flags: ");
1145 if (!(compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_NEWLINE | REGEX_ID_CHECK | REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)))
1146 printf("none ");
1147 if (compiler_common->flags & REGEX_MATCH_BEGIN)
1148 printf("REGEX_MATCH_BEGIN ");
1149 if (compiler_common->flags & REGEX_MATCH_END)
1150 printf("REGEX_MATCH_END ");
1151 if (compiler_common->flags & REGEX_NEWLINE)
1152 printf("REGEX_NEWLINE ");
1153 if (compiler_common->flags & REGEX_ID_CHECK)
1154 printf("REGEX_ID_CHECK ");
1155 if (compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)
1156 printf("REGEX_FAKE_MATCH_BEGIN ");
1157 if (compiler_common->flags & REGEX_FAKE_MATCH_END)
1158 printf("REGEX_FAKE_MATCH_END ");
1159 if (compiler_common->longest_range_size > 0)
1160 printf("(longest range: %ld) ", (long)compiler_common->longest_range_size);
1161 printf("\n");
1164 #endif
1166 /* --------------------------------------------------------------------- */
1167 /* Utilities */
1168 /* --------------------------------------------------------------------- */
1170 static int generate_search_states(struct compiler_common *compiler_common)
1172 struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1173 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1174 struct stack_item *search_states_ptr;
1175 struct stack_item *rng_start = NULL;
1177 compiler_common->terms_size = !(compiler_common->flags & REGEX_FAKE_MATCH_END) ? 1 : 2;
1178 compiler_common->longest_range_size = 0;
1179 compiler_common->search_states = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
1180 if (!compiler_common->search_states)
1181 return REGEX_MEMORY_ERROR;
1183 search_states_ptr = compiler_common->search_states;
1184 while (transitions_ptr < transitions_end) {
1185 switch (transitions_ptr->type) {
1186 case type_begin:
1187 case type_end:
1188 search_states_ptr->type = 0;
1189 break;
1191 case type_char:
1192 search_states_ptr->type = (int)compiler_common->terms_size++;
1193 break;
1195 case type_newline:
1196 if (transitions_ptr->value)
1197 search_states_ptr->type = 1;
1198 else
1199 search_states_ptr->type = (int)compiler_common->terms_size++;
1200 SLJIT_ASSERT(search_states_ptr->type == 1 || search_states_ptr->type == 2);
1201 break;
1203 case type_id:
1204 if (transitions_ptr->value > 0)
1205 compiler_common->flags |= REGEX_ID_CHECK;
1206 search_states_ptr->type = -1;
1207 break;
1209 case type_rng_start:
1210 search_states_ptr->type = (int)compiler_common->terms_size;
1211 rng_start = search_states_ptr;
1212 break;
1214 case type_rng_end:
1215 search_states_ptr->type = (int)compiler_common->terms_size++;
1216 /* This is an over estimation. */
1217 if (compiler_common->longest_range_size < search_states_ptr - rng_start)
1218 compiler_common->longest_range_size = search_states_ptr - rng_start;
1219 break;
1221 default:
1222 search_states_ptr->type = -1;
1223 break;
1225 search_states_ptr->value = -1;
1226 search_states_ptr++;
1227 transitions_ptr++;
1229 return REGEX_NO_ERROR;
1232 static int trace_transitions(int from, struct compiler_common *compiler_common)
1234 int id = 0;
1235 struct stack *stack = &compiler_common->stack;
1236 struct stack *depth = &compiler_common->depth;
1237 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1238 struct stack_item *search_states = compiler_common->search_states;
1240 SLJIT_ASSERT(search_states[from].type >= 0);
1242 from++;
1244 /* Be prepared for any paths (loops, etc). */
1245 while (1) {
1246 if (dfa_transitions[from].type == type_id)
1247 if (id < dfa_transitions[from].value)
1248 id = dfa_transitions[from].value;
1250 if (search_states[from].value < id) {
1251 /* Forward step. */
1252 if (search_states[from].value == -1)
1253 if (stack_push(stack, 0, from))
1254 return REGEX_MEMORY_ERROR;
1255 search_states[from].value = id;
1257 if (dfa_transitions[from].type == type_branch) {
1258 if (stack_push(depth, id, from))
1259 return REGEX_MEMORY_ERROR;
1260 from++;
1261 continue;
1263 else if (dfa_transitions[from].type == type_jump) {
1264 from = dfa_transitions[from].value;
1265 continue;
1267 else if (search_states[from].type < 0) {
1268 from++;
1269 continue;
1273 /* Back tracking. */
1274 if (depth->count > 0) {
1275 id = stack_top(depth)->type;
1276 from = dfa_transitions[stack_pop(depth)->value].value;
1277 continue;
1279 return 0;
1283 /* --------------------------------------------------------------------- */
1284 /* Code generator */
1285 /* --------------------------------------------------------------------- */
1287 #define TERM_OFFSET_OF(index, offs) (((index) * no_states + (offs)) * (sljit_sw)sizeof(sljit_sw))
1288 #define TERM_REL_OFFSET_OF(base, offs) ((base) + ((offs) * (sljit_sw)sizeof(sljit_sw)))
1290 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1291 CHECK(sljit_emit_op1(compiler, type, arg1, arg2, arg3, arg4))
1293 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1294 CHECK(sljit_emit_op2(compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1296 #define EMIT_OP2U(type, arg1, arg2, arg3, arg4) \
1297 CHECK(sljit_emit_op2u(compiler, type, arg1, arg2, arg3, arg4))
1299 #define EMIT_LABEL(label) \
1300 label = sljit_emit_label(compiler); \
1301 CHECK(!label)
1303 #define EMIT_JUMP(jump, type) \
1304 jump = sljit_emit_jump(compiler, type); \
1305 CHECK(!jump)
1307 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1308 jump = sljit_emit_cmp(compiler, type, arg1, arg2, arg3, arg4); \
1309 CHECK(!jump)
1311 /* CHECK depends on the use case. */
1313 #define CHECK(exp) \
1314 if (SLJIT_UNLIKELY(exp)) \
1315 return REGEX_MEMORY_ERROR
1317 static int compile_uncond_tran(struct compiler_common *compiler_common, int reg)
1319 struct sljit_compiler *compiler = compiler_common->compiler;
1320 struct stack *stack = &compiler_common->stack;
1321 struct stack_item *search_states = compiler_common->search_states;
1322 int flags = compiler_common->flags;
1323 sljit_sw no_states = compiler_common->no_states;
1324 sljit_sw head = 0;
1325 sljit_sw offset, value;
1327 if (reg != R_CURR_STATE || !(compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)) {
1328 CHECK(trace_transitions(0, compiler_common));
1330 else {
1331 CHECK(trace_transitions(1, compiler_common));
1334 while (stack->count > 0) {
1335 value = stack_pop(stack)->value;
1336 if (search_states[value].type >= 0) {
1337 offset = TERM_OFFSET_OF(search_states[value].type, 0);
1338 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1339 if (offset > 0)
1340 head = offset;
1342 if (!(flags & REGEX_MATCH_BEGIN)) {
1343 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), R_TEMP, 0);
1344 if (flags & REGEX_ID_CHECK) {
1345 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, search_states[value].value);
1348 else if (flags & REGEX_ID_CHECK) {
1349 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, search_states[value].value);
1352 search_states[value].value = -1;
1354 if (reg == R_NEXT_STATE) {
1355 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1357 else if (flags & REGEX_FAKE_MATCH_BEGIN) {
1358 SLJIT_ASSERT(compiler_common->dfa_transitions[1].type == type_newline && !compiler_common->dfa_transitions[1].value);
1359 offset = TERM_OFFSET_OF(search_states[1].type, 0);
1361 SLJIT_ASSERT(!(flags & REGEX_MATCH_BEGIN));
1363 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1364 head = offset;
1366 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, 1);
1367 if (flags & REGEX_ID_CHECK) {
1368 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, 0);
1371 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, head);
1372 return REGEX_NO_ERROR;
1375 static int compile_cond_tran(struct compiler_common *compiler_common, sljit_sw curr_index)
1377 struct sljit_compiler *compiler = compiler_common->compiler;
1378 struct stack *stack = &compiler_common->stack;
1379 struct stack_item *search_states = compiler_common->search_states;
1380 sljit_sw offset;
1381 int flags;
1382 sljit_sw no_states;
1383 sljit_sw value;
1384 struct sljit_jump *jump1;
1385 struct sljit_jump *jump2;
1386 struct sljit_jump *jump3;
1387 struct sljit_jump *jump4;
1388 struct sljit_jump *jump5;
1389 struct sljit_label *label1;
1391 flags = compiler_common->flags;
1392 no_states = compiler_common->no_states;
1394 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
1395 if (!(flags & (REGEX_ID_CHECK | REGEX_MATCH_BEGIN))) {
1396 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1399 while (stack->count > 0) {
1400 value = stack_pop(stack)->value;
1401 if (search_states[value].type >= 0) {
1402 #ifdef REGEX_MATCH_VERBOSE
1403 if (flags & REGEX_MATCH_VERBOSE)
1404 printf("-> (%3d:%3d) ", search_states[value].type, search_states[value].value);
1405 #endif
1406 offset = TERM_OFFSET_OF(search_states[value].type, 0);
1408 if (!(flags & REGEX_ID_CHECK)) {
1409 if (!(flags & REGEX_MATCH_BEGIN)) {
1410 /* Check whether item is inserted. */
1411 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), SLJIT_IMM, -1);
1412 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), R_NEXT_HEAD, 0);
1413 if (offset > 0) {
1414 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1416 EMIT_JUMP(jump2, SLJIT_JUMP);
1418 /* Check whether old index <= index. */
1419 EMIT_LABEL(label1);
1420 sljit_set_label(jump1, label1);
1422 EMIT_CMP(jump1, SLJIT_LESS_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1424 EMIT_LABEL(label1);
1425 sljit_set_label(jump2, label1);
1426 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1428 EMIT_LABEL(label1);
1429 sljit_set_label(jump1, label1);
1431 else {
1432 /* Check whether item is inserted. */
1433 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), SLJIT_IMM, -1);
1434 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), R_NEXT_HEAD, 0);
1435 if (offset > 0) {
1436 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1438 EMIT_LABEL(label1);
1439 sljit_set_label(jump1, label1);
1442 else {
1443 if (!(flags & REGEX_MATCH_BEGIN)) {
1444 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1446 /* Check whether item is inserted. */
1447 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), SLJIT_IMM, -1);
1448 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), R_NEXT_HEAD, 0);
1449 if (offset > 0) {
1450 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1452 EMIT_JUMP(jump2, SLJIT_JUMP);
1454 /* Check whether old index != index. */
1455 EMIT_LABEL(label1);
1456 sljit_set_label(jump1, label1);
1458 EMIT_OP2U(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1459 EMIT_JUMP(jump1, SLJIT_LESS);
1460 EMIT_JUMP(jump3, SLJIT_NOT_EQUAL); /* Greater. */
1462 /* Old index == index. */
1463 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1464 if (search_states[value].value > 0) {
1465 EMIT_CMP(jump4, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1467 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1468 EMIT_LABEL(label1);
1469 sljit_set_label(jump4, label1);
1472 EMIT_OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1473 EMIT_JUMP(jump4, SLJIT_GREATER_EQUAL);
1474 EMIT_JUMP(jump5, SLJIT_JUMP);
1476 /* Overwrite index & id. */
1477 EMIT_LABEL(label1);
1478 sljit_set_label(jump3, label1);
1479 sljit_set_label(jump2, label1);
1480 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1482 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1483 if (search_states[value].value > 0) {
1484 EMIT_CMP(jump3, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1486 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1487 EMIT_LABEL(label1);
1488 sljit_set_label(jump3, label1);
1491 EMIT_LABEL(label1);
1492 sljit_set_label(jump5, label1);
1493 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1495 /* Exit. */
1496 EMIT_LABEL(label1);
1497 sljit_set_label(jump1, label1);
1498 sljit_set_label(jump4, label1);
1500 else {
1501 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1503 if (search_states[value].value > 0) {
1504 EMIT_CMP(jump1, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1506 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1507 EMIT_LABEL(label1);
1508 sljit_set_label(jump1, label1);
1511 /* Check whether item is inserted. */
1512 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), SLJIT_IMM, -1);
1513 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), R_NEXT_HEAD, 0);
1514 if (offset > 0) {
1515 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1517 EMIT_JUMP(jump2, SLJIT_JUMP);
1519 /* Check whether old id >= id. */
1520 EMIT_LABEL(label1);
1521 sljit_set_label(jump1, label1);
1523 EMIT_CMP(jump1, SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1525 EMIT_LABEL(label1);
1526 sljit_set_label(jump2, label1);
1527 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1529 EMIT_LABEL(label1);
1530 sljit_set_label(jump1, label1);
1534 search_states[value].value = -1;
1537 #ifdef REGEX_MATCH_VERBOSE
1538 if (flags & REGEX_MATCH_VERBOSE)
1539 printf("\n");
1540 #endif
1541 return REGEX_NO_ERROR;
1544 static int compile_end_check(struct compiler_common *compiler_common, struct sljit_label *end_check_label)
1546 struct sljit_compiler *compiler = compiler_common->compiler;
1547 struct sljit_jump *jump;
1548 struct sljit_jump *clear_states_jump;
1549 struct sljit_label *label;
1550 struct sljit_label *leave_label;
1551 struct sljit_label *begin_loop_label;
1553 /* Priority order: best_begin > best_end > best_id.
1554 In other words:
1555 if (new best_begin > old test_begin) do nothing
1556 otherwise we know that new_end > old_end, since R_CURR_INDEX ever increasing
1557 therefore we must overwrite all best_* variables (new_id also contains the highest id for this turn). */
1559 /* Both R_CURR_CHAR and R_BEST_BEGIN used as temporary registers. */
1561 if (!(compiler_common->flags & REGEX_MATCH_BEGIN)) {
1562 EMIT_OP1(SLJIT_MOV, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1563 EMIT_CMP(jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_LESS : SLJIT_LESS_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1564 sljit_set_label(jump, end_check_label);
1566 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1567 if (!(compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) {
1568 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1570 else {
1571 if ((compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) == (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) {
1572 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 2);
1574 else {
1575 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 1);
1578 if (compiler_common->flags & REGEX_ID_CHECK) {
1579 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 3));
1582 EMIT_CMP(clear_states_jump, SLJIT_LESS, R_CURR_CHAR, 0, R_BEST_BEGIN, 0);
1584 EMIT_LABEL(leave_label);
1585 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, R_CURR_CHAR, 0);
1586 EMIT_JUMP(jump, SLJIT_JUMP);
1587 sljit_set_label(jump, end_check_label);
1589 /* A loop to clear all states, which are > (or >=) than R_CURR_CHAR. */
1590 EMIT_LABEL(label);
1591 sljit_set_label(clear_states_jump, label);
1593 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1594 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
1596 /* Begin of the loop. */
1597 EMIT_LABEL(begin_loop_label);
1598 EMIT_CMP(jump, SLJIT_EQUAL, R_TEMP, 0, SLJIT_IMM, 0);
1599 sljit_set_label(jump, leave_label);
1601 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, R_CURR_STATE, 0);
1602 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw));
1603 EMIT_CMP(clear_states_jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_GREATER : SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_TEMP), 2 * sizeof(sljit_sw), R_CURR_CHAR, 0);
1605 /* Case 1: keep this case. */
1606 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), R_NEXT_HEAD, 0);
1607 EMIT_OP2(SLJIT_SUB, R_NEXT_HEAD, 0, R_TEMP, 0, R_CURR_STATE, 0);
1609 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1610 EMIT_JUMP(jump, SLJIT_JUMP);
1611 sljit_set_label(jump, begin_loop_label);
1613 /* Case 2: remove this case. */
1614 EMIT_LABEL(label);
1615 sljit_set_label(clear_states_jump, label);
1617 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), SLJIT_IMM, -1);
1619 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1620 EMIT_JUMP(jump, SLJIT_JUMP);
1621 sljit_set_label(jump, begin_loop_label);
1623 else {
1624 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_IMM, 0);
1625 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
1626 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1627 if (compiler_common->flags & REGEX_ID_CHECK) {
1628 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1630 EMIT_JUMP(jump, SLJIT_JUMP);
1631 sljit_set_label(jump, end_check_label);
1633 return REGEX_NO_ERROR;
1636 static int compile_leave_fast_forward(struct compiler_common *compiler_common, struct sljit_label *fast_forward_label)
1638 struct sljit_compiler *compiler = compiler_common->compiler;
1639 struct stack *stack = &compiler_common->stack;
1640 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1641 struct stack_item *search_states = compiler_common->search_states;
1642 int ind;
1643 struct sljit_jump *jump;
1644 int init_range = 1, prev_value = 0;
1646 while (stack->count > 0) {
1647 ind = stack_pop(stack)->value;
1648 search_states[ind].value = -1;
1649 if (search_states[ind].type >= 0) {
1650 if (dfa_transitions[ind].type == type_char) {
1651 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1652 sljit_set_label(jump, fast_forward_label);
1654 else if (dfa_transitions[ind].type == type_rng_start) {
1655 SLJIT_ASSERT(!dfa_transitions[ind].value);
1656 ind++;
1657 while (dfa_transitions[ind].type != type_rng_end) {
1658 if (dfa_transitions[ind].type == type_rng_char) {
1659 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1660 sljit_set_label(jump, fast_forward_label);
1662 else {
1663 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1664 if (init_range) {
1665 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1666 init_range = 0;
1668 if (dfa_transitions[ind].value != prev_value) {
1669 /* Best compatibility to all archs. */
1670 prev_value -= dfa_transitions[ind].value;
1671 if (prev_value < 0) {
1672 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1674 else {
1675 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1677 prev_value = dfa_transitions[ind].value;
1679 EMIT_CMP(jump, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1680 sljit_set_label(jump, fast_forward_label);
1681 ind++;
1683 ind++;
1686 else {
1687 SLJIT_ASSERT(dfa_transitions[ind].type == type_newline);
1688 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1689 sljit_set_label(jump, fast_forward_label);
1690 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1691 sljit_set_label(jump, fast_forward_label);
1695 return REGEX_NO_ERROR;
1698 static int compile_newline_check(struct compiler_common *compiler_common, sljit_sw ind)
1700 struct sljit_compiler *compiler = compiler_common->compiler;
1701 struct sljit_jump *jump1;
1702 struct sljit_jump *jump2;
1703 struct sljit_label *label;
1704 sljit_sw no_states;
1705 sljit_sw offset;
1707 /* Check whether a new-line character is found. */
1708 EMIT_CMP(jump1, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1709 EMIT_CMP(jump2, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1711 no_states = compiler_common->no_states;
1712 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1713 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1714 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1715 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1717 EMIT_LABEL(label);
1718 sljit_set_label(jump1, label);
1719 sljit_set_label(jump2, label);
1720 return REGEX_NO_ERROR;
1723 #undef CHECK
1725 #define CHECK(exp) \
1726 if (SLJIT_UNLIKELY(exp)) \
1727 return 0
1729 static SLJIT_INLINE void range_set_label(struct sljit_jump **range_jump_list, struct sljit_label *label)
1731 while (*range_jump_list) {
1732 sljit_set_label(*range_jump_list, label);
1733 range_jump_list++;
1737 static sljit_sw compile_range_check(struct compiler_common *compiler_common, sljit_sw ind)
1739 struct sljit_compiler *compiler = compiler_common->compiler;
1740 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1741 struct sljit_jump **range_jump_list = compiler_common->range_jump_list;
1742 int invert = dfa_transitions[ind].value;
1743 struct sljit_label *label;
1744 sljit_sw no_states;
1745 sljit_sw offset;
1746 int init_range = 1, prev_value = 0;
1748 ind++;
1750 while (dfa_transitions[ind].type != type_rng_end) {
1751 if (dfa_transitions[ind].type == type_rng_char) {
1752 EMIT_CMP(*range_jump_list, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1753 range_jump_list++;
1755 else {
1756 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1757 if (init_range) {
1758 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1759 init_range = 0;
1761 if (dfa_transitions[ind].value != prev_value) {
1762 /* Best compatibility to all archs. */
1763 prev_value -= dfa_transitions[ind].value;
1764 if (prev_value < 0) {
1765 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1767 else {
1768 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1770 prev_value = dfa_transitions[ind].value;
1772 EMIT_CMP(*range_jump_list, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1773 range_jump_list++;
1774 ind++;
1776 ind++;
1779 *range_jump_list = NULL;
1781 if (!invert) {
1782 no_states = compiler_common->no_states;
1783 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1784 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1785 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1786 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1788 EMIT_LABEL(label);
1789 range_set_label(compiler_common->range_jump_list, label);
1790 /* Clears the jump list. */
1791 *compiler_common->range_jump_list = NULL;
1793 return ind;
1796 #undef TERM_OFFSET_OF
1797 #undef EMIT_OP1
1798 #undef EMIT_OP2
1799 #undef EMIT_LABEL
1800 #undef EMIT_JUMP
1801 #undef EMIT_CMP
1802 #undef CHECK
1804 /* --------------------------------------------------------------------- */
1805 /* Main compiler */
1806 /* --------------------------------------------------------------------- */
1808 #define TERM_OFFSET_OF(ind, offs) (((ind) * compiler_common.no_states + (offs)) * (sljit_sw)sizeof(sljit_sw))
1810 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1811 CHECK(sljit_emit_op1(compiler_common.compiler, type, arg1, arg2, arg3, arg4))
1813 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1814 CHECK(sljit_emit_op2(compiler_common.compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1816 #define EMIT_LABEL(label) \
1817 label = sljit_emit_label(compiler_common.compiler); \
1818 CHECK(!label)
1820 #define EMIT_JUMP(jump, type) \
1821 jump = sljit_emit_jump(compiler_common.compiler, type); \
1822 CHECK(!jump)
1824 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1825 jump = sljit_emit_cmp(compiler_common.compiler, type, arg1, arg2, arg3, arg4); \
1826 CHECK(!jump)
1828 /* A do {} while(0) expression helps to avoid goto statements. */
1829 #define BEGIN_GUARD \
1830 do {
1832 #define END_GUARD \
1833 } while(0);
1835 #define CHECK(exp) \
1836 if (SLJIT_UNLIKELY(exp)) \
1837 break;
1839 struct regex_machine* regex_compile(const regex_char_t *regex_string, int length, int re_flags, int *error)
1841 struct compiler_common compiler_common;
1842 sljit_sw ind;
1843 int error_code, done, suggest_fast_forward;
1844 /* ID of an empty match (-1 if not reachable). */
1845 int empty_match_id;
1847 struct sljit_jump *jump;
1848 struct sljit_jump *best_match_found_jump;
1849 struct sljit_jump *fast_forward_jump = NULL;
1850 struct sljit_jump *length_is_zero_jump;
1851 struct sljit_jump *end_check_jump = NULL;
1852 struct sljit_jump *best_match_check_jump = NULL;
1853 struct sljit_jump *non_greedy_end_jump = NULL;
1854 struct sljit_label *label;
1855 struct sljit_label *end_check_label = NULL;
1856 struct sljit_label *start_label;
1857 struct sljit_label *fast_forward_label;
1858 struct sljit_label *fast_forward_return_label;
1860 if (error)
1861 *error = REGEX_NO_ERROR;
1862 #ifdef REGEX_MATCH_VERBOSE
1863 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE | REGEX_MATCH_VERBOSE);
1864 #else
1865 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE);
1866 #endif
1868 /* Step 1: parsing (Left->Right).
1869 Syntax check and AST generator. */
1870 error_code = parse(regex_string, length, &compiler_common);
1871 if (error_code) {
1872 stack_destroy(&compiler_common.stack);
1873 if (error)
1874 *error = error_code;
1875 return NULL;
1878 /* Step 2: generating branches (Right->Left). */
1879 error_code = generate_transitions(&compiler_common);
1880 stack_destroy(&compiler_common.stack);
1881 stack_destroy(&compiler_common.depth);
1882 if (error_code) {
1883 if (compiler_common.dfa_transitions)
1884 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1885 if (error)
1886 *error = error_code;
1887 return NULL;
1890 /* Step 3: Generate necessary data for depth-first search (Left->Right). */
1891 error_code = generate_search_states(&compiler_common);
1892 if (error_code) {
1893 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1894 if (error)
1895 *error = error_code;
1896 return NULL;
1899 #ifdef REGEX_MATCH_VERBOSE
1900 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1901 verbose_transitions(&compiler_common);
1902 #endif
1904 /* Step 4: Left->Right generate code. */
1905 stack_init(&compiler_common.stack);
1906 stack_init(&compiler_common.depth);
1907 done = 0;
1908 compiler_common.machine = NULL;
1909 compiler_common.compiler = NULL;
1910 compiler_common.range_jump_list = NULL;
1912 BEGIN_GUARD
1914 compiler_common.machine = (struct regex_machine*)SLJIT_MALLOC(sizeof(struct regex_machine) + (sljit_uw)(compiler_common.terms_size - 1) * sizeof(sljit_uw), NULL);
1915 CHECK(!compiler_common.machine);
1917 compiler_common.compiler = sljit_create_compiler(NULL, NULL);
1918 CHECK(!compiler_common.compiler);
1920 if (compiler_common.longest_range_size > 0) {
1921 compiler_common.range_jump_list = (struct sljit_jump**)SLJIT_MALLOC(sizeof(struct sljit_jump*) * (sljit_uw)compiler_common.longest_range_size, NULL);
1922 CHECK(!compiler_common.range_jump_list);
1925 if ((compiler_common.flags & REGEX_ID_CHECK) && !(compiler_common.flags & REGEX_MATCH_BEGIN))
1926 compiler_common.no_states = 4;
1927 else if (!(compiler_common.flags & REGEX_ID_CHECK) && (compiler_common.flags & REGEX_MATCH_BEGIN))
1928 compiler_common.no_states = 2;
1929 else
1930 compiler_common.no_states = 3;
1932 compiler_common.machine->flags = compiler_common.flags;
1933 compiler_common.machine->no_states = compiler_common.no_states;
1934 compiler_common.machine->size = compiler_common.machine->no_states * compiler_common.terms_size;
1936 /* Study the regular expression. */
1937 empty_match_id = -1;
1938 suggest_fast_forward = 1;
1939 if (!(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN)) {
1940 CHECK(trace_transitions(0, &compiler_common));
1941 while (compiler_common.stack.count > 0) {
1942 ind = stack_pop(&compiler_common.stack)->value;
1943 if (compiler_common.search_states[ind].type == 0) {
1944 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1945 suggest_fast_forward = 0;
1946 empty_match_id = compiler_common.search_states[ind].value;
1948 else if (compiler_common.search_states[ind].type > 0) {
1949 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type != type_end);
1950 if (compiler_common.dfa_transitions[ind].type == type_rng_start && compiler_common.dfa_transitions[ind].value)
1951 suggest_fast_forward = 0;
1953 compiler_common.search_states[ind].value = -1;
1956 else {
1957 SLJIT_ASSERT(compiler_common.dfa_transitions[1].type == type_newline);
1958 CHECK(trace_transitions(1, &compiler_common));
1959 while (compiler_common.stack.count > 0) {
1960 ind = stack_pop(&compiler_common.stack)->value;
1961 if (compiler_common.search_states[ind].type == 0) {
1962 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1963 suggest_fast_forward = 0;
1964 empty_match_id = compiler_common.search_states[ind].value;
1966 compiler_common.search_states[ind].value = -1;
1970 /* Step 4.1: Generate entry. */
1971 CHECK(sljit_emit_enter(compiler_common.compiler, 0, SLJIT_ARGS3(VOID, P, P, 32), 5, 5, 0, 0, 0));
1973 /* Copy arguments to their place. */
1974 EMIT_OP1(SLJIT_MOV, R_REGEX_MATCH, 0, SLJIT_S0, 0);
1975 EMIT_OP1(SLJIT_MOV, R_STRING, 0, SLJIT_S1, 0);
1976 EMIT_OP2(SLJIT_ADD, R_LENGTH, 0, SLJIT_S2, 0, SLJIT_IMM, 1);
1978 /* Init global registers. */
1979 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
1980 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
1981 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
1982 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin));
1983 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index));
1985 /* Check whether the best match has already found in a previous frame. */
1986 EMIT_CMP(jump, SLJIT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 0);
1987 EMIT_JUMP(best_match_found_jump, SLJIT_JUMP);
1989 #ifdef REGEX_MATCH_VERBOSE
1990 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1991 printf("\n-----------------\nTrace\n-----------------\n");
1992 #endif
1994 /* Step 4.2: Generate code for state 0. */
1995 EMIT_LABEL(label);
1996 sljit_emit_op0(compiler_common.compiler, SLJIT_ENDBR);
1997 compiler_common.machine->entry_addrs[0] = (sljit_uw)label;
1999 /* Swapping current and next. */
2000 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_STATE, 0);
2001 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_NEXT_STATE, 0);
2002 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_TEMP, 0);
2004 /* Checking whether the best case needs to be updated. */
2005 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2006 EMIT_CMP(end_check_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2007 EMIT_LABEL(end_check_label);
2009 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2010 EMIT_OP2(SLJIT_ADD, R_CURR_INDEX, 0, R_CURR_INDEX, 0, SLJIT_IMM, 1);
2012 /* Checking whether best case has already found. */
2013 if (!(compiler_common.flags & REGEX_MATCH_END) || (compiler_common.flags & REGEX_MATCH_BEGIN)) {
2014 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2015 /* We can bail out if no more active states remain and R_BEST_BEGIN != -1. */
2016 EMIT_CMP(best_match_check_jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2018 else {
2019 /* We can bail out if no more active states remain (regardless of R_BEST_BEGIN). */
2020 EMIT_CMP(best_match_check_jump, SLJIT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2024 EMIT_LABEL(start_label);
2025 sljit_set_label(jump, start_label);
2027 if (!(compiler_common.flags & REGEX_MATCH_BEGIN) && suggest_fast_forward) {
2028 EMIT_CMP(fast_forward_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2031 /* Loading the next character. */
2032 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_LENGTH, 0, R_LENGTH, 0, SLJIT_IMM, 1);
2033 EMIT_JUMP(length_is_zero_jump, SLJIT_EQUAL);
2035 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_STRING, 0);
2036 #ifdef REGEX_USE_8BIT_CHARS
2037 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2038 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 1);
2039 #else
2040 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2041 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 2);
2042 #endif
2043 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_TEMP, 0);
2045 #ifdef REGEX_MATCH_VERBOSE
2046 if (compiler_common.flags & REGEX_MATCH_VERBOSE) {
2047 printf("(%3d): ", 0);
2048 CHECK(trace_transitions(0, &compiler_common));
2049 while (compiler_common.stack.count > 0) {
2050 ind = stack_pop(&compiler_common.stack)->value;
2051 if (compiler_common.search_states[ind].type >= 0)
2052 printf("-> (%3d:%3d) ", compiler_common.search_states[ind].type, compiler_common.search_states[ind].value);
2053 compiler_common.search_states[ind].value = -1;
2055 printf("\n");
2057 #endif
2059 EMIT_LABEL(fast_forward_return_label);
2060 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2061 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 1);
2062 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2063 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2066 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_INDEX, 0);
2067 CHECK(compile_uncond_tran(&compiler_common, R_NEXT_STATE));
2068 /* And branching to the first state. */
2069 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2071 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2072 EMIT_LABEL(label);
2073 sljit_set_label(jump, label);
2076 /* This is the case where we only have to reset the R_NEXT_HEAD. */
2077 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
2078 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2079 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2081 /* Fast-forward loop. */
2082 if (fast_forward_jump) {
2083 /* Quit from fast-forward loop. */
2084 EMIT_LABEL(fast_forward_label);
2085 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2086 EMIT_OP1(SLJIT_MOV, R_LENGTH, 0, R_NEXT_STATE, 0);
2087 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_CURR_STATE, 0);
2088 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, R_NEXT_HEAD, 0);
2089 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
2090 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
2091 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
2093 /* Update the start field of the locations. */
2094 CHECK(trace_transitions(0, &compiler_common));
2095 while (compiler_common.stack.count > 0) {
2096 ind = stack_pop(&compiler_common.stack)->value;
2097 if (compiler_common.search_states[ind].type >= 0) {
2098 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 2), R_TEMP, 0);
2100 compiler_common.search_states[ind].value = -1;
2102 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2103 EMIT_JUMP(jump, SLJIT_JUMP);
2104 sljit_set_label(jump, fast_forward_return_label);
2106 /* Start fast-forward. */
2107 EMIT_LABEL(label);
2108 sljit_set_label(fast_forward_jump, label);
2110 /* Moving everything to registers. */
2111 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2112 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2113 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2114 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_LENGTH, 0);
2115 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_STRING, 0);
2116 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, R_CURR_INDEX, 0);
2118 /* Fast forward mainloop. */
2119 EMIT_LABEL(label);
2120 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_NEXT_STATE, 0, R_NEXT_STATE, 0, SLJIT_IMM, 1);
2121 EMIT_JUMP(fast_forward_jump, SLJIT_EQUAL);
2123 #ifdef REGEX_USE_8BIT_CHARS
2124 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2125 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 1);
2126 #else
2127 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2128 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 2);
2129 #endif
2131 CHECK(trace_transitions(0, &compiler_common));
2132 CHECK(compile_leave_fast_forward(&compiler_common, fast_forward_label));
2134 EMIT_OP2(SLJIT_ADD, R_NEXT_HEAD, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2135 EMIT_JUMP(jump, SLJIT_JUMP);
2136 sljit_set_label(jump, label);
2138 /* String is finished. */
2139 EMIT_LABEL(label);
2140 sljit_set_label(fast_forward_jump, label);
2141 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_NEXT_HEAD, 0);
2142 EMIT_JUMP(fast_forward_jump, SLJIT_JUMP);
2145 /* End check. */
2146 if (end_check_jump) {
2147 EMIT_LABEL(label);
2148 sljit_set_label(end_check_jump, label);
2150 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || !(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2151 CHECK(compile_end_check(&compiler_common, end_check_label));
2153 else {
2154 /* Since we leave, we do not need to update the R_BEST_BEGIN. */
2155 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2156 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
2157 if (compiler_common.flags & REGEX_ID_CHECK) {
2158 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
2160 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2161 EMIT_JUMP(non_greedy_end_jump, SLJIT_JUMP);
2165 /* Finish check. */
2166 if (best_match_check_jump) {
2167 EMIT_LABEL(label);
2168 sljit_set_label(best_match_check_jump, label);
2170 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2171 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2172 sljit_set_label(jump, start_label);
2174 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2177 /* Leaving matching and storing the necessary values. */
2178 EMIT_LABEL(label);
2179 sljit_set_label(length_is_zero_jump, label);
2180 if (non_greedy_end_jump)
2181 sljit_set_label(non_greedy_end_jump, label);
2183 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_CURR_INDEX, 0);
2184 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2185 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2186 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2188 /* Exit from JIT. */
2189 EMIT_LABEL(label);
2190 sljit_set_label(best_match_found_jump, label);
2191 if (fast_forward_jump)
2192 sljit_set_label(fast_forward_jump, label);
2193 CHECK(sljit_emit_return_void(compiler_common.compiler));
2195 for (ind = 1; ind < (sljit_sw)compiler_common.dfa_size - 1; ind++) {
2196 if (compiler_common.search_states[ind].type >= 0) {
2197 SLJIT_ASSERT(compiler_common.search_states[ind].type < compiler_common.terms_size);
2198 EMIT_LABEL(label);
2199 sljit_emit_op0(compiler_common.compiler, SLJIT_ENDBR);
2200 compiler_common.machine->entry_addrs[compiler_common.search_states[ind].type] = (sljit_uw)label;
2202 if (compiler_common.dfa_transitions[ind].type == type_char) {
2203 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, compiler_common.dfa_transitions[ind].value);
2205 else if (compiler_common.dfa_transitions[ind].type == type_rng_start) {
2206 ind = compile_range_check(&compiler_common, ind);
2207 CHECK(!ind);
2209 else {
2210 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2211 CHECK(compile_newline_check(&compiler_common, ind));
2214 CHECK(trace_transitions((int)ind, &compiler_common));
2215 #ifdef REGEX_MATCH_VERBOSE
2216 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2217 printf("(%3d): ", compiler_common.search_states[ind].type);
2218 #endif
2219 CHECK(compile_cond_tran(&compiler_common, compiler_common.search_states[ind].type));
2221 if (compiler_common.dfa_transitions[ind].type == type_char) {
2222 EMIT_LABEL(label);
2223 sljit_set_label(jump, label);
2225 else if (compiler_common.dfa_transitions[ind].type == type_rng_end) {
2226 EMIT_LABEL(label);
2227 range_set_label(compiler_common.range_jump_list, label);
2229 else {
2230 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2233 /* Branch to the next item in the list. */
2234 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1));
2235 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1), SLJIT_IMM, -1);
2236 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2240 if (ind == (sljit_sw)compiler_common.dfa_size - 1) {
2241 /* Generate an init stub function. */
2242 EMIT_LABEL(label);
2243 CHECK(sljit_emit_enter(compiler_common.compiler, 0, SLJIT_ARGS2(W, P, P), 3, 3, 0, 0, 0));
2245 if (empty_match_id == -1) {
2246 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, -1);
2247 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, 0);
2249 else {
2250 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2251 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, empty_match_id);
2254 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, index), SLJIT_IMM, !(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN) ? 1 : 2);
2256 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || empty_match_id == -1) {
2257 /* The else is a really rare event, so we still generate an empty function instead of a runtime pointer check. */
2258 SLJIT_ASSERT(R_CURR_STATE == SLJIT_S0);
2259 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2260 /* R_CURR_INDEX (put to R_TEMP) is zero. */
2261 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, 0);
2263 CHECK(compile_uncond_tran(&compiler_common, R_CURR_STATE));
2265 else {
2266 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2268 CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_MOV, R_NEXT_HEAD, 0));
2270 compiler_common.machine->continue_match = sljit_generate_code(compiler_common.compiler);
2271 #ifndef SLJIT_INDIRECT_CALL
2272 compiler_common.machine->u.init_match = (void*)(sljit_sw)sljit_get_label_addr(label);
2273 #else
2274 sljit_set_function_context(&compiler_common.machine->u.init_match, &compiler_common.machine->context, sljit_get_label_addr(label), regex_compile);
2275 #endif
2276 #ifdef REGEX_MATCH_VERBOSE
2277 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2278 printf("Continue match: %p Init match: %p\n\n", compiler_common.machine->continue_match, compiler_common.machine->u.init_match);
2279 #endif
2280 if (compiler_common.machine->continue_match) {
2281 for (ind = 0; ind < compiler_common.terms_size; ++ind)
2282 compiler_common.machine->entry_addrs[ind] = sljit_get_label_addr((struct sljit_label*)compiler_common.machine->entry_addrs[ind]);
2283 done = 1;
2286 END_GUARD
2288 stack_destroy(&compiler_common.stack);
2289 stack_destroy(&compiler_common.depth);
2290 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
2291 SLJIT_FREE(compiler_common.search_states, NULL);
2292 if (compiler_common.range_jump_list)
2293 SLJIT_FREE(compiler_common.range_jump_list, NULL);
2294 if (compiler_common.compiler)
2295 sljit_free_compiler(compiler_common.compiler);
2296 if (done)
2297 return compiler_common.machine;
2299 if (compiler_common.machine) {
2300 SLJIT_FREE(compiler_common.machine, NULL);
2302 if (error)
2303 *error = REGEX_MEMORY_ERROR;
2304 return NULL;
2307 #undef TERM_OFFSET_OF
2308 #undef EMIT_OP1
2309 #undef EMIT_OP2
2310 #undef EMIT_LABEL
2311 #undef EMIT_JUMP
2312 #undef EMIT_CMP
2313 #undef BEGIN_GUARD
2314 #undef END_GUARD
2315 #undef CHECK
2317 void regex_free_machine(struct regex_machine *machine)
2319 sljit_free_code(machine->continue_match, NULL);
2320 SLJIT_FREE(machine, NULL);
2323 const char* regex_get_platform_name(void)
2325 return sljit_get_platform_name();
2328 /* --------------------------------------------------------------------- */
2329 /* Mathching utilities */
2330 /* --------------------------------------------------------------------- */
2332 struct regex_match* regex_begin_match(struct regex_machine *machine)
2334 sljit_sw *ptr1;
2335 sljit_sw *ptr2;
2336 sljit_sw *end;
2337 sljit_sw *entry_addrs;
2339 struct regex_match *match = (struct regex_match*)SLJIT_MALLOC(sizeof(struct regex_match) + (sljit_uw)(machine->size * 2 - 1) * sizeof(sljit_sw), NULL);
2340 if (!match)
2341 return NULL;
2343 ptr1 = match->states;
2344 ptr2 = match->states + machine->size;
2345 end = ptr2;
2346 entry_addrs = (sljit_sw*)machine->entry_addrs;
2348 match->current = ptr1;
2349 match->next = ptr2;
2350 match->head = 0;
2351 match->machine = machine;
2353 /* Init machine states. */
2354 switch (machine->no_states) {
2355 case 2:
2356 while (ptr1 < end) {
2357 *ptr1++ = *entry_addrs;
2358 *ptr2++ = *entry_addrs++;
2359 *ptr1++ = -1;
2360 *ptr2++ = -1;
2362 break;
2364 case 3:
2365 while (ptr1 < end) {
2366 *ptr1++ = *entry_addrs;
2367 *ptr2++ = *entry_addrs++;
2368 *ptr1++ = -1;
2369 *ptr2++ = -1;
2370 *ptr1++ = 0;
2371 *ptr2++ = 0;
2373 break;
2375 case 4:
2376 while (ptr1 < end) {
2377 *ptr1++ = *entry_addrs;
2378 *ptr2++ = *entry_addrs++;
2379 *ptr1++ = -1;
2380 *ptr2++ = -1;
2381 *ptr1++ = 0;
2382 *ptr2++ = 0;
2383 *ptr1++ = 0;
2384 *ptr2++ = 0;
2386 break;
2388 default:
2389 SLJIT_UNREACHABLE();
2390 break;
2393 SLJIT_ASSERT(ptr1 == end);
2395 match->u.continue_match = machine->continue_match;
2397 regex_reset_match(match);
2398 return match;
2401 void regex_reset_match(struct regex_match *match)
2403 struct regex_machine *machine = match->machine;
2404 sljit_sw current, ind;
2405 sljit_sw *current_ptr;
2407 match->best_end = 0;
2408 match->fast_quit = 0;
2409 match->fast_forward = 0;
2411 if (match->head != 0) {
2412 /* Clear the current state. */
2413 current = match->head;
2414 current_ptr = match->current;
2415 do {
2416 ind = (current / (sljit_sw)sizeof(sljit_sw)) + 1;
2417 current = current_ptr[ind];
2418 current_ptr[ind] = -1;
2419 } while (current != 0);
2421 match->head = machine->u.call_init(match->current, match);
2424 void regex_free_match(struct regex_match *match)
2426 SLJIT_FREE(match, NULL);
2429 void regex_continue_match(struct regex_match *match, const regex_char_t *input_string, int length)
2431 match->u.call_continue(match, input_string, length);
2434 int regex_get_result(struct regex_match *match, int *end, int *id)
2436 int flags = match->machine->flags;
2437 sljit_sw no_states;
2439 *end = (int)match->best_end;
2440 *id = (int)match->best_id;
2441 if (!(flags & (REGEX_MATCH_END | REGEX_FAKE_MATCH_END)))
2442 return (int)match->best_begin;
2444 if (flags & REGEX_FAKE_MATCH_END) {
2445 SLJIT_ASSERT(!(flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END)));
2446 if (match->best_begin != -1)
2447 return (int)match->best_begin;
2449 no_states = match->machine->no_states;
2450 if (match->current[no_states + 1] == -1)
2451 return -1;
2452 if (flags & REGEX_ID_CHECK)
2453 *id = (int)match->current[no_states + 3];
2454 if (!(flags & REGEX_FAKE_MATCH_BEGIN))
2455 *end = (int)match->index - 1;
2456 else
2457 *end = (int)match->index - 2;
2458 return (int)match->current[no_states + 2];
2460 else {
2461 /* Check the status of the last code. */
2462 if (!(flags & REGEX_MATCH_BEGIN)) {
2463 /* No shortcut in this case. */
2464 if (!(flags & REGEX_ID_CHECK)) {
2465 if (match->current[1] == -1)
2466 return -1;
2467 *end = (int)match->index - 1;
2468 return (int)match->current[2];
2471 if (match->current[1] == -1)
2472 return -1;
2473 *end = (int)match->index - 1;
2474 *id = (int)match->current[3];
2475 return (int)match->current[2];
2478 /* Shortcut is possible in this case. */
2479 if (!(flags & REGEX_ID_CHECK)) {
2480 if (match->current[1] == -1 || match->head == -1)
2481 return -1;
2482 *end = (int)match->index - 1;
2483 return 0;
2486 if (match->current[1] == -1 || match->head == -1)
2487 return -1;
2488 *end = (int)match->index - 1;
2489 *id = (int)match->current[2];
2490 return 0;
2494 int regex_is_match_finished(struct regex_match *match)
2496 return (int)match->fast_quit;
2499 #ifdef REGEX_MATCH_VERBOSE
2500 void regex_continue_match_debug(struct regex_match *match, const regex_char_t *input_string, int length)
2502 sljit_sw *ptr;
2503 sljit_sw *end;
2504 sljit_sw count;
2505 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2506 sljit_sw current;
2507 #endif
2508 sljit_sw no_states = match->machine->no_states;
2509 sljit_sw len = match->machine->size;
2511 while (length > 0) {
2512 match->u.call_continue(match, input_string, 1);
2514 if (match->fast_forward) {
2515 if (match->machine->flags & REGEX_MATCH_VERBOSE)
2516 printf("fast forward\n");
2519 /* Verbose (first). */
2520 if (match->machine->flags & REGEX_MATCH_VERBOSE) {
2521 ptr = match->current;
2522 end = ptr + len;
2523 count = 0;
2524 printf("'%c' (%3ld->%3ld [%3ld]) ", *input_string, (long)match->best_begin, (long)match->best_end, (long)match->best_id);
2525 while (ptr < end) {
2526 printf("[%3ld:", (long)count++);
2527 switch (no_states) {
2528 case 2:
2529 if (ptr[1] != -1)
2530 printf("+] ");
2531 else
2532 printf(" ] ");
2533 break;
2535 case 3:
2536 if (ptr[1] != -1)
2537 printf("+,%3ld] ", (long)ptr[2]);
2538 else
2539 printf(" ,XXX] ");
2540 break;
2542 case 4:
2543 if (ptr[1] != -1)
2544 printf("+,%3ld,%3ld] ", (long)ptr[2], (long)ptr[3]);
2545 else
2546 printf(" ,XXX,XXX] ");
2547 break;
2549 ptr += no_states;
2551 printf("\n");
2554 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2555 /* Sanity check (later). */
2556 ptr = match->next;
2557 end = ptr + len;
2558 while (ptr < end) {
2559 SLJIT_ASSERT(ptr[1] == -1);
2560 ptr += no_states;
2563 /* Check number of active elements. */
2564 ptr = match->current + no_states;
2565 end = ptr + len - no_states;
2566 count = 0;
2567 while (ptr < end) {
2568 if (ptr[1] != -1)
2569 count++;
2570 ptr += no_states;
2573 /* Check chain list. */
2574 current = match->head;
2575 ptr = match->current;
2576 while (current != 0) {
2577 SLJIT_ASSERT(current >= 0 && current < len * (sljit_sw)sizeof(sljit_sw));
2578 SLJIT_ASSERT((current % (no_states * (sljit_sw)sizeof(sljit_sw))) == 0);
2579 SLJIT_ASSERT(count > 0);
2580 current = ptr[(current / (sljit_sw)sizeof(sljit_sw)) + 1];
2581 count--;
2583 SLJIT_ASSERT(count == 0);
2584 #endif
2586 if (match->fast_quit) {
2587 /* the machine has stopped working. */
2588 if (match->machine->flags & REGEX_MATCH_VERBOSE)
2589 printf("Best match has found\n");
2590 break;
2593 input_string++;
2594 length--;
2597 #endif