Rework function argument list descriptor macros.
[sljit.git] / regex_src / regexJIT.c
blobc4f133023c0b29dd8402a4fa41964fde22d74f79
1 /*
2 * Stack-less Just-In-Time compiler
4 * Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
6 * Redistribution and use in source and binary forms, with or without modification, are
7 * permitted provided that the following conditions are met:
9 * 1. Redistributions of source code must retain the above copyright notice, this list of
10 * conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice, this list
13 * of conditions and the following disclaimer in the documentation and/or other materials
14 * provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "sljitLir.h"
28 #include "regexJIT.h"
30 #include <stdlib.h>
32 #ifdef REGEX_MATCH_VERBOSE
33 #include <stdio.h>
34 #endif
36 /* Extra, hidden flags:
37 {id!} where id > 0 found in the code. */
38 #define REGEX_ID_CHECK 0x100
39 /* When REGEX_NEWLINE && REGEX_MATCH_BEGIN defined, the pattern turn to a normal search,
40 which starts with [\r\n] character range. */
41 #define REGEX_FAKE_MATCH_BEGIN 0x200
42 /* When REGEX_NEWLINE && REGEX_MATCH_END defined, the pattern turn to a normal search,
43 which ends with [\r\n] character range. */
44 #define REGEX_FAKE_MATCH_END 0x400
46 /* --------------------------------------------------------------------- */
47 /* Structures for JIT-ed pattern matching */
48 /* --------------------------------------------------------------------- */
50 struct regex_machine
52 /* flags. */
53 int flags;
54 /* Number of state descriptors for one term. */
55 sljit_sw no_states;
56 /* Total size. */
57 sljit_sw size;
59 union {
60 void *init_match;
61 sljit_sw (SLJIT_FUNC *call_init)(void *next, void* match);
62 } u;
63 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
64 struct sljit_function_context context;
65 #endif
67 void *continue_match;
69 /* Variable sized array to contain the handler addresses. */
70 sljit_uw entry_addrs[1];
73 struct regex_match
75 /* Current and next state array. */
76 sljit_sw *current;
77 sljit_sw *next;
78 /* Starting. */
79 sljit_sw head;
80 /* String character index (ever increasing). */
81 sljit_sw index;
82 /* Best match found so far (members in priority order). */
83 sljit_sw best_begin;
84 sljit_sw best_end;
85 sljit_sw best_id;
86 /* Bool flags (encoded as word). */
87 sljit_sw fast_quit;
88 sljit_sw fast_forward;
89 /* Machine. */
90 struct regex_machine *machine;
92 union {
93 void *continue_match;
94 void (SLJIT_FUNC *call_continue)(struct regex_match *match, const regex_char_t *input_string, int length);
95 } u;
97 /* Variable sized array to contain the state arrays. */
98 sljit_sw states[1];
101 /* State vector
102 ITEM[0] - pointer to the address inside the machine code
103 ITEM[1] - next pointer
104 ITEM[2] - string started from (optional)
105 ITEM[3] - max ID (optional) */
107 /* Register allocation. */
108 /* Current state array (loaded & stored: regex_match->current). */
109 #define R_CURR_STATE SLJIT_S0
110 /* Next state array (loaded & stored: regex_match->next). */
111 #define R_NEXT_STATE SLJIT_S1
112 /* Head (loaded & stored: regex_match->head). */
113 #define R_NEXT_HEAD SLJIT_S2
114 /* String fragment pointer. */
115 #define R_STRING SLJIT_S3
116 /* String fragment length. */
117 #define R_LENGTH SLJIT_S4
118 /* 'struct regex_match*' */
119 #define R_REGEX_MATCH SLJIT_R0
120 /* Current character. */
121 #define R_CURR_CHAR SLJIT_R1
122 /* Temporary register. */
123 #define R_TEMP SLJIT_R2
124 /* Caches the regex_match->best_begin. */
125 #define R_BEST_BEGIN SLJIT_R3
126 /* Current character index. */
127 #define R_CURR_INDEX SLJIT_R4
129 /* --------------------------------------------------------------------- */
130 /* Stack management */
131 /* --------------------------------------------------------------------- */
133 /* Try to allocate 2^n blocks. */
134 #define STACK_FRAGMENT_SIZE (((64 * sizeof(struct stack_item)) - (sizeof(struct stack_fragment_data))) / (sizeof(struct stack_item)))
136 struct stack_item {
137 int type;
138 int value;
141 struct stack_fragment_data {
142 struct stack_fragment *next;
143 struct stack_fragment *prev;
146 struct stack_fragment {
147 struct stack_fragment_data data;
148 struct stack_item items[STACK_FRAGMENT_SIZE];
151 struct stack {
152 struct stack_fragment *first;
153 struct stack_fragment *last;
154 int index;
155 int count;
158 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
160 static void stack_check(struct stack *stack)
162 struct stack_fragment *curr;
163 int found;
165 if (!stack)
166 return;
168 SLJIT_ASSERT(stack->index >= 0 && stack->index < STACK_FRAGMENT_SIZE);
170 if (stack->first == NULL) {
171 SLJIT_ASSERT(stack->first == NULL && stack->last == NULL);
172 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
173 return;
176 found = 0;
177 if (stack->last == NULL) {
178 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
179 found = 1;
181 else
182 SLJIT_ASSERT(stack->index >= 0 && stack->count >= 0);
184 SLJIT_ASSERT(stack->first->data.prev == NULL);
185 curr = stack->first;
186 while (curr) {
187 if (curr == stack->last)
188 found = 1;
189 if (curr->data.next)
190 SLJIT_ASSERT(curr->data.next->data.prev == curr);
191 curr = curr->data.next;
193 SLJIT_ASSERT(found);
196 #endif
198 static void stack_init(struct stack *stack)
200 stack->first = NULL;
201 stack->last = NULL;
202 stack->index = STACK_FRAGMENT_SIZE - 1;
203 stack->count = 0;
206 static void stack_destroy(struct stack *stack)
208 struct stack_fragment *curr = stack->first;
209 struct stack_fragment *prev;
211 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
212 stack_check(stack);
213 #endif
215 while (curr) {
216 prev = curr;
217 curr = curr->data.next;
218 SLJIT_FREE(prev, NULL);
222 static SLJIT_INLINE struct stack_item* stack_top(struct stack *stack)
224 SLJIT_ASSERT(stack->last);
225 return stack->last->items + stack->index;
228 static int stack_push(struct stack *stack, int type, int value)
230 if (stack->last) {
231 stack->index++;
232 if (stack->index >= STACK_FRAGMENT_SIZE) {
233 stack->index = 0;
234 if (!stack->last->data.next) {
235 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
236 if (!stack->last->data.next)
237 return 1;
238 stack->last->data.next->data.next = NULL;
239 stack->last->data.next->data.prev = stack->last;
241 stack->last = stack->last->data.next;
244 else if (!stack->first) {
245 stack->last = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
246 if (!stack->last)
247 return 1;
248 stack->last->data.prev = NULL;
249 stack->last->data.next = NULL;
250 stack->first = stack->last;
251 stack->index = 0;
253 else {
254 stack->last = stack->first;
255 stack->index = 0;
257 stack->last->items[stack->index].type = type;
258 stack->last->items[stack->index].value = value;
259 stack->count++;
260 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
261 stack_check(stack);
262 #endif
263 return 0;
266 static struct stack_item* stack_pop(struct stack *stack)
268 struct stack_item *ret = stack_top(stack);
270 if (stack->index > 0)
271 stack->index--;
272 else {
273 stack->last = stack->last->data.prev;
274 stack->index = STACK_FRAGMENT_SIZE - 1;
277 stack->count--;
278 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
279 stack_check(stack);
280 #endif
281 return ret;
284 static SLJIT_INLINE void stack_clone(struct stack *src, struct stack *dst)
286 *dst = *src;
289 static int stack_push_copy(struct stack *stack, int items, int length)
291 struct stack_fragment *frag1;
292 int ind1;
293 struct stack_fragment *frag2;
294 int ind2;
295 int counter;
297 SLJIT_ASSERT(stack->count >= length && items <= length && items > 0);
299 /* Allocate the necessary elements. */
300 counter = items;
301 frag1 = stack->last;
302 ind1 = stack->index;
303 while (counter > 0) {
304 if (stack->index + counter >= STACK_FRAGMENT_SIZE) {
305 counter -= STACK_FRAGMENT_SIZE - stack->index - 1 + 1;
306 stack->index = 0;
307 if (!stack->last->data.next) {
308 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
309 if (!stack->last->data.next)
310 return 1;
311 stack->last->data.next->data.next = NULL;
312 stack->last->data.next->data.prev = stack->last;
314 stack->last = stack->last->data.next;
316 else {
317 stack->index += counter;
318 counter = 0;
322 frag2 = stack->last;
323 ind2 = stack->index;
324 while (length > 0) {
325 frag2->items[ind2--] = frag1->items[ind1--];
326 if (ind1 < 0) {
327 ind1 = STACK_FRAGMENT_SIZE - 1;
328 frag1 = frag1->data.prev;
330 if (ind2 < 0) {
331 ind2 = STACK_FRAGMENT_SIZE - 1;
332 frag2 = frag2->data.prev;
334 length--;
337 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
338 stack_check(stack);
339 #endif
340 stack->count += items;
341 return 0;
344 /* --------------------------------------------------------------------- */
345 /* Parser */
346 /* --------------------------------------------------------------------- */
348 enum {
349 /* Common. */
350 type_begin,
351 type_end,
352 type_char,
353 type_newline,
354 type_id,
355 type_rng_start,
356 type_rng_end,
357 type_rng_char,
358 type_rng_left,
359 type_rng_right,
361 /* generator only. */
362 type_branch,
363 type_jump,
365 /* Parser only. */
366 type_open_br,
367 type_close_br,
368 type_select,
369 type_asterisk,
370 type_plus_sign,
371 type_qestion_mark
374 struct compiler_common {
375 /* Temporary stacks. */
376 struct stack stack;
377 struct stack depth;
378 /* REGEX_ flags. */
379 int flags;
380 /* Encoded size of the dfa representation. */
381 sljit_sw dfa_size;
382 /* Number of terms. */
383 sljit_sw terms_size;
384 /* Number of state descriptors for one term (same as machine->no_states). */
385 sljit_sw no_states;
386 /* Number of type_rng_(char|left)-s in the longest character range. */
387 sljit_sw longest_range_size;
389 /* DFA linear representation (size: dfa_size). */
390 struct stack_item *dfa_transitions;
391 /* Term id and search state pairs (size: dfa_size). */
392 struct stack_item *search_states;
394 /* sljit compiler */
395 struct sljit_compiler *compiler;
396 /* Machine data, which must be kept for later use. */
397 struct regex_machine *machine;
398 /* Temporary space for jumps (size: longest_range_size). */
399 struct sljit_jump **range_jump_list;
402 static const regex_char_t* decode_number(const regex_char_t *regex_string, int length, int *result)
404 int value = 0;
406 SLJIT_ASSERT(length > 0);
407 if (*regex_string < '0' || *regex_string > '9') {
408 *result = -1;
409 return regex_string;
412 while (length > 0 && *regex_string >= '0' && *regex_string <= '9') {
413 value = value * 10 + (*regex_string - '0');
414 length--;
415 regex_string++;
418 *result = value;
419 return regex_string;
422 static int iterate(struct stack *stack, int min, int max)
424 struct stack it;
425 struct stack_item *item;
426 int count = -1;
427 int len = 0;
428 int depth = 0;
430 stack_clone(stack, &it);
432 /* Calculate size. */
433 while (count < 0) {
434 item = stack_pop(&it);
435 switch (item->type) {
436 case type_id:
437 case type_rng_end:
438 case type_rng_char:
439 case type_rng_left:
440 case type_rng_right:
441 case type_plus_sign:
442 case type_qestion_mark:
443 len++;
444 break;
446 case type_asterisk:
447 len += 2;
448 break;
450 case type_close_br:
451 depth++;
452 break;
454 case type_open_br:
455 SLJIT_ASSERT(depth > 0);
456 depth--;
457 if (depth == 0)
458 count = it.count;
459 break;
461 case type_select:
462 SLJIT_ASSERT(depth > 0);
463 len += 2;
464 break;
466 default:
467 SLJIT_ASSERT(item->type != type_begin && item->type != type_end);
468 if (depth == 0)
469 count = it.count;
470 len++;
471 break;
475 if (min == 0 && max == 0) {
476 /* {0,0} case, not {0,} case: delete subtree. */
477 stack_clone(&it, stack);
478 /* and put an empty bracket expression instead of it. */
479 if (stack_push(stack, type_open_br, 0))
480 return REGEX_MEMORY_ERROR;
481 if (stack_push(stack, type_close_br, 0))
482 return REGEX_MEMORY_ERROR;
483 return len;
486 count = stack->count - count;
488 /* Put an open bracket before the sequence. */
489 if (stack_push_copy(stack, 1, count))
490 return -1;
492 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
493 SLJIT_ASSERT(stack_push(&it, type_open_br, 0) == 0);
494 #else
495 stack_push(&it, type_open_br, 0);
496 #endif
498 /* Copy the data. */
499 if (max > 0) {
500 len = len * (max - 1);
501 max -= min;
502 /* Insert ? operators. */
503 len += max;
505 if (min > 0) {
506 min--;
507 while (min > 0) {
508 if (stack_push_copy(stack, count, count))
509 return -1;
510 min--;
512 if (max > 0) {
513 if (stack_push_copy(stack, count, count))
514 return -1;
515 if (stack_push(stack, type_qestion_mark, 0))
516 return REGEX_MEMORY_ERROR;
517 count++;
518 max--;
521 else {
522 SLJIT_ASSERT(max > 0);
523 max--;
524 count++;
525 if (stack_push(stack, type_qestion_mark, 0))
526 return REGEX_MEMORY_ERROR;
529 while (max > 0) {
530 if (stack_push_copy(stack, count, count))
531 return -1;
532 max--;
535 else {
536 SLJIT_ASSERT(min > 0);
537 min--;
538 /* Insert + operator. */
539 len = len * min + 1;
540 while (min > 0) {
541 if (stack_push_copy(stack, count, count))
542 return -1;
543 min--;
546 if (stack_push(stack, type_plus_sign, 0))
547 return REGEX_MEMORY_ERROR;
550 /* Close the opened bracket. */
551 if (stack_push(stack, type_close_br, 0))
552 return REGEX_MEMORY_ERROR;
554 return len;
557 static int parse_iterator(const regex_char_t *regex_string, int length, struct stack *stack, sljit_sw *dfa_size, int begin)
559 /* We only know that *regex_string == { . */
560 int val1, val2;
561 const regex_char_t *base_from = regex_string;
562 const regex_char_t *from;
564 length--;
565 regex_string++;
567 /* Decode left value. */
568 val2 = -1;
569 if (length == 0)
570 return -2;
571 if (*regex_string == ',') {
572 val1 = 0;
573 length--;
574 regex_string++;
576 else {
577 from = regex_string;
578 regex_string = decode_number(regex_string, length, &val1);
579 if (val1 < 0)
580 return -2;
581 length -= regex_string - from;
583 if (length == 0)
584 return -2;
585 if (*regex_string == '}') {
586 val2 = val1;
587 if (val1 == 0)
588 val1 = -1;
590 else if (length >= 2 && *regex_string == '!' && regex_string[1] == '}') {
591 /* Non posix extension. */
592 if (stack_push(stack, type_id, val1))
593 return -1;
594 (*dfa_size)++;
595 return (regex_string - base_from) + 1;
597 else {
598 if (*regex_string != ',')
599 return -2;
600 length--;
601 regex_string++;
605 if (begin)
606 return -2;
608 /* Decode right value. */
609 if (val2 == -1) {
610 if (length == 0)
611 return -2;
612 if (*regex_string == '}')
613 val2 = 0;
614 else {
615 from = regex_string;
616 regex_string = decode_number(regex_string, length, &val2);
617 length -= regex_string - from;
618 if (val2 < 0 || length == 0 || *regex_string != '}' || val2 < val1)
619 return -2;
620 if (val2 == 0) {
621 SLJIT_ASSERT(val1 == 0);
622 val1 = -1;
627 /* Fast cases. */
628 if (val1 > 1 || val2 > 1) {
629 val1 = iterate(stack, val1, val2);
630 if (val1 < 0)
631 return -1;
632 *dfa_size += val1;
634 else if (val1 == 0 && val2 == 0) {
635 if (stack_push(stack, type_asterisk, 0))
636 return -1;
637 *dfa_size += 2;
639 else if (val1 == 1 && val2 == 0) {
640 if (stack_push(stack, type_plus_sign, 0))
641 return -1;
642 (*dfa_size)++;
644 else if (val1 == 0 && val2 == 1) {
645 if (stack_push(stack, type_qestion_mark, 0))
646 return -1;
647 (*dfa_size)++;
649 else if (val1 == -1) {
650 val1 = iterate(stack, 0, 0);
651 if (val1 < 0)
652 return -1;
653 *dfa_size -= val1;
654 SLJIT_ASSERT(*dfa_size >= 2);
656 else {
657 /* Ignore. */
658 SLJIT_ASSERT(val1 == 1 && val2 == 1);
660 return regex_string - base_from;
663 static int parse_char_range(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
665 struct stack* stack = &compiler_common->stack;
666 const regex_char_t *base_from = regex_string;
667 regex_char_t left_char, right_char, tmp_char;
669 length--;
670 regex_string++;
672 if (length == 0)
673 return -2;
675 if (*regex_string != '^') {
676 if (stack_push(stack, type_rng_start, 0))
677 return -1;
679 else {
680 length--;
681 regex_string++;
683 if (length == 0)
684 return -2;
686 if (stack_push(stack, type_rng_start, 1))
687 return -1;
689 /* For both the type_rng_start & type_rng_end. */
690 compiler_common->dfa_size += 2;
692 /* Range must be at least 1 character. */
693 if (*regex_string == ']') {
694 length--;
695 regex_string++;
696 if (stack_push(stack, type_rng_char, ']'))
697 return -1;
698 compiler_common->dfa_size++;
701 while (1) {
702 if (length == 0)
703 return -2;
705 if (*regex_string == ']')
706 break;
708 if (*regex_string != '\\')
709 left_char = *regex_string;
710 else {
711 regex_string++;
712 length--;
713 if (length == 0)
714 return -2;
715 left_char = *regex_string;
717 regex_string++;
718 length--;
720 /* Is a range here? */
721 if (length >= 3 && *regex_string == '-' && *(regex_string + 1) != ']') {
722 regex_string++;
723 length--;
725 if (*regex_string != '\\')
726 right_char = *regex_string;
727 else {
728 regex_string++;
729 length--;
730 if (length == 0)
731 return -2;
732 right_char = *regex_string;
734 regex_string++;
735 length--;
737 if (left_char > right_char) {
738 /* Swap if necessary. */
739 tmp_char = left_char;
740 left_char = right_char;
741 right_char = tmp_char;
744 if (stack_push(stack, type_rng_left, left_char))
745 return -1;
746 if (stack_push(stack, type_rng_right, right_char))
747 return -1;
748 compiler_common->dfa_size += 2;
750 else {
751 if (stack_push(stack, type_rng_char, left_char))
752 return -1;
753 compiler_common->dfa_size++;
757 if (stack_push(stack, type_rng_end, 0))
758 return -1;
759 return regex_string - base_from;
762 static int parse(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
764 /* Depth of bracketed expressions. */
765 int depth = 0;
766 /* Have we already found a term? '1' if not yet. */
767 int begin = 1;
768 /* Cache stack pointer. */
769 struct stack* stack = &compiler_common->stack;
770 int tmp;
772 /* Type_begin and type_end. */
773 compiler_common->dfa_size = 2;
774 stack_init(stack);
775 if (stack_push(stack, type_begin, 0))
776 return REGEX_MEMORY_ERROR;
778 if (length > 0 && *regex_string == '^') {
779 compiler_common->flags |= REGEX_MATCH_BEGIN;
780 length--;
781 regex_string++;
784 if ((compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) == (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) {
785 /* Replace REGEX_MATCH_BEGIN flag to REGEX_FAKE_MATCH_BEGIN */
786 compiler_common->flags &= ~REGEX_MATCH_BEGIN;
787 compiler_common->flags |= REGEX_FAKE_MATCH_BEGIN;
788 /* and append a new-line search. */
789 if (stack_push(stack, type_newline, 0))
790 return REGEX_MEMORY_ERROR;
791 compiler_common->dfa_size++;
792 /* Begin intentionally kept as 1. */
795 while (length > 0) {
796 switch (*regex_string) {
797 case '\\' :
798 length--;
799 regex_string++;
800 if (length == 0)
801 return REGEX_INVALID_REGEX;
802 if (stack_push(stack, type_char, *regex_string))
803 return REGEX_MEMORY_ERROR;
804 begin = 0;
805 compiler_common->dfa_size++;
806 break;
808 case '.' :
809 if (stack_push(stack, type_rng_start, 1))
810 return REGEX_MEMORY_ERROR;
811 if (compiler_common->flags & REGEX_NEWLINE) {
812 if (stack_push(stack, type_rng_char, '\n'))
813 return REGEX_MEMORY_ERROR;
814 if (stack_push(stack, type_rng_char, '\r'))
815 return REGEX_MEMORY_ERROR;
816 compiler_common->dfa_size += 2;
818 if (stack_push(stack, type_rng_end, 1))
819 return REGEX_MEMORY_ERROR;
820 begin = 0;
821 compiler_common->dfa_size += 2;
822 break;
824 case '(' :
825 depth++;
826 if (stack_push(stack, type_open_br, 0))
827 return REGEX_MEMORY_ERROR;
828 begin = 1;
829 break;
831 case ')' :
832 if (depth == 0)
833 return REGEX_INVALID_REGEX;
834 depth--;
835 if (stack_push(stack, type_close_br, 0))
836 return REGEX_MEMORY_ERROR;
837 begin = 0;
838 break;
840 case '|' :
841 if (stack_push(stack, type_select, 0))
842 return REGEX_MEMORY_ERROR;
843 begin = 1;
844 compiler_common->dfa_size += 2;
845 break;
847 case '*' :
848 if (begin)
849 return REGEX_INVALID_REGEX;
850 if (stack_push(stack, type_asterisk, 0))
851 return REGEX_MEMORY_ERROR;
852 compiler_common->dfa_size += 2;
853 break;
855 case '?' :
856 case '+' :
857 if (begin)
858 return REGEX_INVALID_REGEX;
859 if (stack_push(stack, (*regex_string == '+') ? type_plus_sign : type_qestion_mark, 0))
860 return REGEX_MEMORY_ERROR;
861 compiler_common->dfa_size++;
862 break;
864 case '{' :
865 tmp = parse_iterator(regex_string, length, stack, &compiler_common->dfa_size, begin);
867 if (tmp >= 0) {
868 length -= tmp;
869 regex_string += tmp;
871 else if (tmp == -1)
872 return REGEX_MEMORY_ERROR;
873 else {
874 /* Not a valid range expression. */
875 SLJIT_ASSERT(tmp == -2);
876 if (stack_push(stack, type_char, '{'))
877 return REGEX_MEMORY_ERROR;
878 compiler_common->dfa_size++;
880 break;
882 case '[' :
883 tmp = parse_char_range(regex_string, length, compiler_common);
884 if (tmp >= 0) {
885 length -= tmp;
886 regex_string += tmp;
888 else if (tmp == -1)
889 return REGEX_MEMORY_ERROR;
890 else {
891 SLJIT_ASSERT(tmp == -2);
892 return REGEX_INVALID_REGEX;
894 begin = 0;
895 break;
897 default:
898 if (length == 1 && *regex_string == '$') {
899 compiler_common->flags |= REGEX_MATCH_END;
900 break;
902 if (stack_push(stack, type_char, *regex_string))
903 return REGEX_MEMORY_ERROR;
904 begin = 0;
905 compiler_common->dfa_size++;
906 break;
908 length--;
909 regex_string++;
912 if (depth != 0)
913 return REGEX_INVALID_REGEX;
915 if ((compiler_common->flags & (REGEX_MATCH_END | REGEX_NEWLINE)) == (REGEX_MATCH_END | REGEX_NEWLINE)) {
916 /* Replace REGEX_MATCH_END flag to REGEX_FAKE_MATCH_END */
917 compiler_common->flags &= ~REGEX_MATCH_END;
918 compiler_common->flags |= REGEX_FAKE_MATCH_END;
919 /* and append a new-line search. */
920 if (stack_push(stack, type_newline, 1))
921 return REGEX_MEMORY_ERROR;
922 compiler_common->dfa_size++;
923 /* Begin intentionally kept as 1. */
926 if (stack_push(stack, type_end, 0))
927 return REGEX_MEMORY_ERROR;
929 return REGEX_NO_ERROR;
932 /* --------------------------------------------------------------------- */
933 /* Generating machine state transitions */
934 /* --------------------------------------------------------------------- */
936 #define PUT_TRANSITION(typ, val) \
937 do { \
938 --transitions_ptr; \
939 transitions_ptr->type = typ; \
940 transitions_ptr->value = val; \
941 } while (0)
943 static struct stack_item* handle_iteratives(struct stack_item *transitions_ptr, struct stack_item *transitions, struct stack *depth)
945 struct stack_item *item;
947 while (1) {
948 item = stack_top(depth);
950 switch (item->type) {
951 case type_asterisk:
952 SLJIT_ASSERT(transitions[item->value].type == type_branch);
953 transitions[item->value].value = transitions_ptr - transitions;
954 PUT_TRANSITION(type_branch, item->value + 1);
955 break;
957 case type_plus_sign:
958 SLJIT_ASSERT(transitions[item->value].type == type_branch);
959 transitions[item->value].value = transitions_ptr - transitions;
960 break;
962 case type_qestion_mark:
963 PUT_TRANSITION(type_branch, item->value);
964 break;
966 default:
967 return transitions_ptr;
969 stack_pop(depth);
973 static int generate_transitions(struct compiler_common *compiler_common)
975 struct stack *stack = &compiler_common->stack;
976 struct stack *depth = &compiler_common->depth;
977 struct stack_item *transitions_ptr;
978 struct stack_item *item;
980 stack_init(depth);
981 compiler_common->dfa_transitions = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
982 if (!compiler_common->dfa_transitions)
983 return REGEX_MEMORY_ERROR;
985 /* Go through the items of the stack and generate the necessary branches and jumps (edges of DFA). */
986 transitions_ptr = compiler_common->dfa_transitions + compiler_common->dfa_size;
987 while (stack->count > 0) {
988 item = stack_pop(stack);
989 switch (item->type) {
990 case type_begin:
991 case type_open_br:
992 item = stack_pop(depth);
993 if (item->type == type_select)
994 PUT_TRANSITION(type_branch, item->value + 1);
995 else
996 SLJIT_ASSERT(item->type == type_close_br);
997 if (stack->count == 0)
998 PUT_TRANSITION(type_begin, 0);
999 else
1000 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1001 break;
1003 case type_end:
1004 case type_close_br:
1005 if (item->type == type_end)
1006 *--transitions_ptr = *item;
1007 if (stack_push(depth, type_close_br, transitions_ptr - compiler_common->dfa_transitions))
1008 return REGEX_MEMORY_ERROR;
1009 break;
1011 case type_select:
1012 item = stack_top(depth);
1013 if (item->type == type_select) {
1014 SLJIT_ASSERT(compiler_common->dfa_transitions[item->value].type == type_jump);
1015 PUT_TRANSITION(type_branch, item->value + 1);
1016 PUT_TRANSITION(type_jump, item->value);
1017 item->value = transitions_ptr - compiler_common->dfa_transitions;
1019 else {
1020 SLJIT_ASSERT(item->type == type_close_br);
1021 item->type = type_select;
1022 PUT_TRANSITION(type_jump, item->value);
1023 item->value = transitions_ptr - compiler_common->dfa_transitions;
1025 break;
1027 case type_asterisk:
1028 case type_plus_sign:
1029 case type_qestion_mark:
1030 if (item->type != type_qestion_mark)
1031 PUT_TRANSITION(type_branch, 0);
1032 if (stack_push(depth, item->type, transitions_ptr - compiler_common->dfa_transitions))
1033 return REGEX_MEMORY_ERROR;
1034 break;
1036 case type_char:
1037 case type_newline:
1038 case type_rng_start:
1039 /* Requires handle_iteratives. */
1040 *--transitions_ptr = *item;
1041 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1042 break;
1044 default:
1045 *--transitions_ptr = *item;
1046 break;
1050 SLJIT_ASSERT(compiler_common->dfa_transitions == transitions_ptr);
1051 SLJIT_ASSERT(depth->count == 0);
1052 return REGEX_NO_ERROR;
1055 #undef PUT_TRANSITION
1057 #ifdef REGEX_MATCH_VERBOSE
1059 static void verbose_transitions(struct compiler_common *compiler_common)
1061 struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1062 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1063 struct stack_item *search_states_ptr = compiler_common->search_states;
1064 int pos;
1066 printf("-----------------\nTransitions\n-----------------\n");
1067 pos = 0;
1068 while (transitions_ptr < transitions_end) {
1069 printf("[%3d] ", pos++);
1070 if (search_states_ptr->type >= 0)
1071 printf("(%3d) ", search_states_ptr->type);
1072 switch (transitions_ptr->type) {
1073 case type_begin:
1074 printf("type_begin\n");
1075 break;
1077 case type_end:
1078 printf("type_end\n");
1079 break;
1081 case type_char:
1082 if (transitions_ptr->value >= ' ')
1083 printf("type_char '%c'\n", transitions_ptr->value);
1084 else
1085 printf("type_char 0x%x\n", transitions_ptr->value);
1086 break;
1088 case type_newline:
1089 printf("type_newline %s\n", transitions_ptr->value ? "(end)" : "(begin)");
1090 break;
1092 case type_id:
1093 printf("type_id %d\n", transitions_ptr->value);
1094 break;
1096 case type_rng_start:
1097 printf("type_rng_start %s\n", transitions_ptr->value ? "(invert)" : "(normal)");
1098 break;
1100 case type_rng_end:
1101 printf("type_rng_end\n");
1102 break;
1104 case type_rng_char:
1105 if (transitions_ptr->value >= ' ')
1106 printf("type_rng_char '%c'\n", transitions_ptr->value);
1107 else
1108 printf("type_rng_char 0x%x\n", transitions_ptr->value);
1109 break;
1111 case type_rng_left:
1112 if (transitions_ptr->value >= ' ')
1113 printf("type_rng_left '%c'\n", transitions_ptr->value);
1114 else
1115 printf("type_rng_left 0x%x\n", transitions_ptr->value);
1116 break;
1118 case type_rng_right:
1119 if (transitions_ptr->value >= ' ')
1120 printf("type_rng_right '%c'\n", transitions_ptr->value);
1121 else
1122 printf("type_rng_right 0x%x\n", transitions_ptr->value);
1123 break;
1125 case type_branch:
1126 printf("type_branch -> %d\n", transitions_ptr->value);
1127 break;
1129 case type_jump:
1130 printf("type_jump -> %d\n", transitions_ptr->value);
1131 break;
1133 default:
1134 printf("UNEXPECTED TYPE\n");
1135 break;
1137 transitions_ptr++;
1138 search_states_ptr++;
1140 printf("flags: ");
1141 if (!(compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_NEWLINE | REGEX_ID_CHECK | REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)))
1142 printf("none ");
1143 if (compiler_common->flags & REGEX_MATCH_BEGIN)
1144 printf("REGEX_MATCH_BEGIN ");
1145 if (compiler_common->flags & REGEX_MATCH_END)
1146 printf("REGEX_MATCH_END ");
1147 if (compiler_common->flags & REGEX_NEWLINE)
1148 printf("REGEX_NEWLINE ");
1149 if (compiler_common->flags & REGEX_ID_CHECK)
1150 printf("REGEX_ID_CHECK ");
1151 if (compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)
1152 printf("REGEX_FAKE_MATCH_BEGIN ");
1153 if (compiler_common->flags & REGEX_FAKE_MATCH_END)
1154 printf("REGEX_FAKE_MATCH_END ");
1155 if (compiler_common->longest_range_size > 0)
1156 printf("(longest range: %ld) ", (long)compiler_common->longest_range_size);
1157 printf("\n");
1160 #endif
1162 /* --------------------------------------------------------------------- */
1163 /* Utilities */
1164 /* --------------------------------------------------------------------- */
1166 static int generate_search_states(struct compiler_common *compiler_common)
1168 struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1169 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1170 struct stack_item *search_states_ptr;
1171 struct stack_item *rng_start = NULL;
1173 compiler_common->terms_size = !(compiler_common->flags & REGEX_FAKE_MATCH_END) ? 1 : 2;
1174 compiler_common->longest_range_size = 0;
1175 compiler_common->search_states = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
1176 if (!compiler_common->search_states)
1177 return REGEX_MEMORY_ERROR;
1179 search_states_ptr = compiler_common->search_states;
1180 while (transitions_ptr < transitions_end) {
1181 switch (transitions_ptr->type) {
1182 case type_begin:
1183 case type_end:
1184 search_states_ptr->type = 0;
1185 break;
1187 case type_char:
1188 search_states_ptr->type = compiler_common->terms_size++;
1189 break;
1191 case type_newline:
1192 if (transitions_ptr->value)
1193 search_states_ptr->type = 1;
1194 else
1195 search_states_ptr->type = compiler_common->terms_size++;
1196 SLJIT_ASSERT(search_states_ptr->type == 1 || search_states_ptr->type == 2);
1197 break;
1199 case type_id:
1200 if (transitions_ptr->value > 0)
1201 compiler_common->flags |= REGEX_ID_CHECK;
1202 search_states_ptr->type = -1;
1203 break;
1205 case type_rng_start:
1206 search_states_ptr->type = compiler_common->terms_size;
1207 rng_start = search_states_ptr;
1208 break;
1210 case type_rng_end:
1211 search_states_ptr->type = compiler_common->terms_size++;
1212 /* Ok, this is a blunt over estimation :) */
1213 if (compiler_common->longest_range_size < search_states_ptr - rng_start)
1214 compiler_common->longest_range_size = search_states_ptr - rng_start;
1215 break;
1217 default:
1218 search_states_ptr->type = -1;
1219 break;
1221 search_states_ptr->value = -1;
1222 search_states_ptr++;
1223 transitions_ptr++;
1225 return REGEX_NO_ERROR;
1228 static int trace_transitions(int from, struct compiler_common *compiler_common)
1230 int id = 0;
1231 struct stack *stack = &compiler_common->stack;
1232 struct stack *depth = &compiler_common->depth;
1233 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1234 struct stack_item *search_states = compiler_common->search_states;
1236 SLJIT_ASSERT(search_states[from].type >= 0);
1238 from++;
1240 /* Be prepared for any paths (loops, etc). */
1241 while (1) {
1242 if (dfa_transitions[from].type == type_id)
1243 if (id < dfa_transitions[from].value)
1244 id = dfa_transitions[from].value;
1246 if (search_states[from].value < id) {
1247 /* Forward step. */
1248 if (search_states[from].value == -1)
1249 if (stack_push(stack, 0, from))
1250 return REGEX_MEMORY_ERROR;
1251 search_states[from].value = id;
1253 if (dfa_transitions[from].type == type_branch) {
1254 if (stack_push(depth, id, from))
1255 return REGEX_MEMORY_ERROR;
1256 from++;
1257 continue;
1259 else if (dfa_transitions[from].type == type_jump) {
1260 from = dfa_transitions[from].value;
1261 continue;
1263 else if (search_states[from].type < 0) {
1264 from++;
1265 continue;
1269 /* Back tracking. */
1270 if (depth->count > 0) {
1271 id = stack_top(depth)->type;
1272 from = dfa_transitions[stack_pop(depth)->value].value;
1273 continue;
1275 return 0;
1279 /* --------------------------------------------------------------------- */
1280 /* Code generator */
1281 /* --------------------------------------------------------------------- */
1283 #define TERM_OFFSET_OF(index, offs) (((index) * no_states + (offs)) * sizeof(sljit_sw))
1284 #define TERM_REL_OFFSET_OF(base, offs) ((base) + ((offs) * sizeof(sljit_sw)))
1286 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1287 CHECK(sljit_emit_op1(compiler, type, arg1, arg2, arg3, arg4))
1289 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1290 CHECK(sljit_emit_op2(compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1292 #define EMIT_OP2U(type, arg1, arg2, arg3, arg4) \
1293 CHECK(sljit_emit_op2u(compiler, type, arg1, arg2, arg3, arg4))
1295 #define EMIT_LABEL(label) \
1296 label = sljit_emit_label(compiler); \
1297 CHECK(!label)
1299 #define EMIT_JUMP(jump, type) \
1300 jump = sljit_emit_jump(compiler, type); \
1301 CHECK(!jump)
1303 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1304 jump = sljit_emit_cmp(compiler, type, arg1, arg2, arg3, arg4); \
1305 CHECK(!jump)
1307 /* CHECK depends on the use case. */
1309 #define CHECK(exp) \
1310 if (SLJIT_UNLIKELY(exp)) \
1311 return REGEX_MEMORY_ERROR
1313 static int compile_uncond_tran(struct compiler_common *compiler_common, int reg)
1315 struct sljit_compiler *compiler = compiler_common->compiler;
1316 struct stack *stack = &compiler_common->stack;
1317 struct stack_item *search_states = compiler_common->search_states;
1318 int flags = compiler_common->flags;
1319 sljit_sw no_states = compiler_common->no_states;
1320 sljit_uw head = 0;
1321 sljit_sw offset, value;
1323 if (reg != R_CURR_STATE || !(compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)) {
1324 CHECK(trace_transitions(0, compiler_common));
1326 else {
1327 CHECK(trace_transitions(1, compiler_common));
1330 while (stack->count > 0) {
1331 value = stack_pop(stack)->value;
1332 if (search_states[value].type >= 0) {
1333 offset = TERM_OFFSET_OF(search_states[value].type, 0);
1334 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1335 if (offset > 0)
1336 head = offset;
1338 if (!(flags & REGEX_MATCH_BEGIN)) {
1339 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), R_TEMP, 0);
1340 if (flags & REGEX_ID_CHECK) {
1341 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, search_states[value].value);
1344 else if (flags & REGEX_ID_CHECK) {
1345 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, search_states[value].value);
1348 search_states[value].value = -1;
1350 if (reg == R_NEXT_STATE) {
1351 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1353 else if (flags & REGEX_FAKE_MATCH_BEGIN) {
1354 SLJIT_ASSERT(compiler_common->dfa_transitions[1].type == type_newline && !compiler_common->dfa_transitions[1].value);
1355 offset = TERM_OFFSET_OF(search_states[1].type, 0);
1357 SLJIT_ASSERT(!(flags & REGEX_MATCH_BEGIN));
1359 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1360 head = offset;
1362 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, 1);
1363 if (flags & REGEX_ID_CHECK) {
1364 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, 0);
1367 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, head);
1368 return REGEX_NO_ERROR;
1371 static int compile_cond_tran(struct compiler_common *compiler_common, sljit_sw curr_index)
1373 struct sljit_compiler *compiler = compiler_common->compiler;
1374 struct stack *stack = &compiler_common->stack;
1375 struct stack_item *search_states = compiler_common->search_states;
1376 sljit_sw offset;
1377 int flags;
1378 sljit_sw no_states;
1379 sljit_sw value;
1380 struct sljit_jump *jump1;
1381 struct sljit_jump *jump2;
1382 struct sljit_jump *jump3;
1383 struct sljit_jump *jump4;
1384 struct sljit_jump *jump5;
1385 struct sljit_label *label1;
1387 flags = compiler_common->flags;
1388 no_states = compiler_common->no_states;
1390 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
1391 if (!(flags & (REGEX_ID_CHECK | REGEX_MATCH_BEGIN))) {
1392 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1395 while (stack->count > 0) {
1396 value = stack_pop(stack)->value;
1397 if (search_states[value].type >= 0) {
1398 #ifdef REGEX_MATCH_VERBOSE
1399 if (flags & REGEX_MATCH_VERBOSE)
1400 printf("-> (%3d:%3d) ", search_states[value].type, search_states[value].value);
1401 #endif
1402 offset = TERM_OFFSET_OF(search_states[value].type, 0);
1404 if (!(flags & REGEX_ID_CHECK)) {
1405 if (!(flags & REGEX_MATCH_BEGIN)) {
1406 /* Check whether item is inserted. */
1407 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1408 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1409 if (offset > 0) {
1410 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1412 EMIT_JUMP(jump2, SLJIT_JUMP);
1414 /* Check whether old index <= index. */
1415 EMIT_LABEL(label1);
1416 sljit_set_label(jump1, label1);
1418 EMIT_CMP(jump1, SLJIT_LESS_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1420 EMIT_LABEL(label1);
1421 sljit_set_label(jump2, label1);
1422 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1424 EMIT_LABEL(label1);
1425 sljit_set_label(jump1, label1);
1427 else {
1428 /* Check whether item is inserted. */
1429 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1430 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1431 if (offset > 0) {
1432 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1434 EMIT_LABEL(label1);
1435 sljit_set_label(jump1, label1);
1438 else {
1439 if (!(flags & REGEX_MATCH_BEGIN)) {
1440 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1442 /* Check whether item is inserted. */
1443 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1444 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1445 if (offset > 0) {
1446 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1448 EMIT_JUMP(jump2, SLJIT_JUMP);
1450 /* Check whether old index != index. */
1451 EMIT_LABEL(label1);
1452 sljit_set_label(jump1, label1);
1454 EMIT_OP2U(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1455 EMIT_JUMP(jump1, SLJIT_LESS);
1456 EMIT_JUMP(jump3, SLJIT_NOT_EQUAL); /* Greater. */
1458 /* Old index == index. */
1459 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1460 if (search_states[value].value > 0) {
1461 EMIT_CMP(jump4, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1463 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1464 EMIT_LABEL(label1);
1465 sljit_set_label(jump4, label1);
1468 EMIT_OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0);
1469 EMIT_JUMP(jump4, SLJIT_GREATER_EQUAL);
1470 EMIT_JUMP(jump5, SLJIT_JUMP);
1472 /* Overwrite index & id. */
1473 EMIT_LABEL(label1);
1474 sljit_set_label(jump3, label1);
1475 sljit_set_label(jump2, label1);
1476 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1478 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1479 if (search_states[value].value > 0) {
1480 EMIT_CMP(jump3, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1482 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1483 EMIT_LABEL(label1);
1484 sljit_set_label(jump3, label1);
1487 EMIT_LABEL(label1);
1488 sljit_set_label(jump5, label1);
1489 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0);
1491 /* Exit. */
1492 EMIT_LABEL(label1);
1493 sljit_set_label(jump1, label1);
1494 sljit_set_label(jump4, label1);
1496 else {
1497 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1499 if (search_states[value].value > 0) {
1500 EMIT_CMP(jump1, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1502 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1503 EMIT_LABEL(label1);
1504 sljit_set_label(jump1, label1);
1507 /* Check whether item is inserted. */
1508 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1509 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1510 if (offset > 0) {
1511 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1513 EMIT_JUMP(jump2, SLJIT_JUMP);
1515 /* Check whether old id >= id. */
1516 EMIT_LABEL(label1);
1517 sljit_set_label(jump1, label1);
1519 EMIT_CMP(jump1, SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1521 EMIT_LABEL(label1);
1522 sljit_set_label(jump2, label1);
1523 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1525 EMIT_LABEL(label1);
1526 sljit_set_label(jump1, label1);
1530 search_states[value].value = -1;
1533 #ifdef REGEX_MATCH_VERBOSE
1534 if (flags & REGEX_MATCH_VERBOSE)
1535 printf("\n");
1536 #endif
1537 return REGEX_NO_ERROR;
1540 static int compile_end_check(struct compiler_common *compiler_common, struct sljit_label *end_check_label)
1542 struct sljit_compiler *compiler = compiler_common->compiler;
1543 struct sljit_jump *jump;
1544 struct sljit_jump *clear_states_jump;
1545 struct sljit_label *label;
1546 struct sljit_label *leave_label;
1547 struct sljit_label *begin_loop_label;
1549 /* Priority order: best_begin > best_end > best_id.
1550 In other words:
1551 if (new best_begin > old test_begin) do nothing
1552 otherwise we know that new_end > old_end, since R_CURR_INDEX ever increasing
1553 therefore we must overwrite all best_* variables (new_id also contains the highest id for this turn). */
1555 /* Both R_CURR_CHAR and R_BEST_BEGIN used as temporary registers. */
1557 if (!(compiler_common->flags & REGEX_MATCH_BEGIN)) {
1558 EMIT_OP1(SLJIT_MOV, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1559 EMIT_CMP(jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_LESS : SLJIT_LESS_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1560 sljit_set_label(jump, end_check_label);
1562 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1563 if (!(compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) {
1564 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1566 else {
1567 if ((compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) == (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) {
1568 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 2);
1570 else {
1571 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 1);
1574 if (compiler_common->flags & REGEX_ID_CHECK) {
1575 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 3));
1578 EMIT_CMP(clear_states_jump, SLJIT_LESS, R_CURR_CHAR, 0, R_BEST_BEGIN, 0);
1580 EMIT_LABEL(leave_label);
1581 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, R_CURR_CHAR, 0);
1582 EMIT_JUMP(jump, SLJIT_JUMP);
1583 sljit_set_label(jump, end_check_label);
1585 /* A loop to clear all states, which are > (or >=) than R_CURR_CHAR. */
1586 EMIT_LABEL(label);
1587 sljit_set_label(clear_states_jump, label);
1589 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1590 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
1592 /* Begin of the loop. */
1593 EMIT_LABEL(begin_loop_label);
1594 EMIT_CMP(jump, SLJIT_EQUAL, R_TEMP, 0, SLJIT_IMM, 0);
1595 sljit_set_label(jump, leave_label);
1597 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, R_CURR_STATE, 0);
1598 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw));
1599 EMIT_CMP(clear_states_jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_GREATER : SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_TEMP), 2 * sizeof(sljit_sw), R_CURR_CHAR, 0);
1601 /* Case 1: keep this case. */
1602 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), R_NEXT_HEAD, 0);
1603 EMIT_OP2(SLJIT_SUB, R_NEXT_HEAD, 0, R_TEMP, 0, R_CURR_STATE, 0);
1605 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1606 EMIT_JUMP(jump, SLJIT_JUMP);
1607 sljit_set_label(jump, begin_loop_label);
1609 /* Case 2: remove this case. */
1610 EMIT_LABEL(label);
1611 sljit_set_label(clear_states_jump, label);
1613 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), SLJIT_IMM, -1);
1615 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1616 EMIT_JUMP(jump, SLJIT_JUMP);
1617 sljit_set_label(jump, begin_loop_label);
1619 else {
1620 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_IMM, 0);
1621 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
1622 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1623 if (compiler_common->flags & REGEX_ID_CHECK) {
1624 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1626 EMIT_JUMP(jump, SLJIT_JUMP);
1627 sljit_set_label(jump, end_check_label);
1629 return REGEX_NO_ERROR;
1632 static int compile_leave_fast_forward(struct compiler_common *compiler_common, struct sljit_label *fast_forward_label)
1634 struct sljit_compiler *compiler = compiler_common->compiler;
1635 struct stack *stack = &compiler_common->stack;
1636 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1637 struct stack_item *search_states = compiler_common->search_states;
1638 int ind;
1639 struct sljit_jump *jump;
1640 int init_range = 1, prev_value = 0;
1642 while (stack->count > 0) {
1643 ind = stack_pop(stack)->value;
1644 search_states[ind].value = -1;
1645 if (search_states[ind].type >= 0) {
1646 if (dfa_transitions[ind].type == type_char) {
1647 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1648 sljit_set_label(jump, fast_forward_label);
1650 else if (dfa_transitions[ind].type == type_rng_start) {
1651 SLJIT_ASSERT(!dfa_transitions[ind].value);
1652 ind++;
1653 while (dfa_transitions[ind].type != type_rng_end) {
1654 if (dfa_transitions[ind].type == type_rng_char) {
1655 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1656 sljit_set_label(jump, fast_forward_label);
1658 else {
1659 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1660 if (init_range) {
1661 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1662 init_range = 0;
1664 if (dfa_transitions[ind].value != prev_value) {
1665 /* Best compatibility to all archs. */
1666 prev_value -= dfa_transitions[ind].value;
1667 if (prev_value < 0) {
1668 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1670 else {
1671 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1673 prev_value = dfa_transitions[ind].value;
1675 EMIT_CMP(jump, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1676 sljit_set_label(jump, fast_forward_label);
1677 ind++;
1679 ind++;
1682 else {
1683 SLJIT_ASSERT(dfa_transitions[ind].type == type_newline);
1684 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1685 sljit_set_label(jump, fast_forward_label);
1686 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1687 sljit_set_label(jump, fast_forward_label);
1691 return REGEX_NO_ERROR;
1694 static int compile_newline_check(struct compiler_common *compiler_common, sljit_sw ind)
1696 struct sljit_compiler *compiler = compiler_common->compiler;
1697 struct sljit_jump *jump1;
1698 struct sljit_jump *jump2;
1699 struct sljit_label *label;
1700 sljit_sw no_states;
1701 sljit_sw offset;
1703 /* Check whether a new-line character is found. */
1704 EMIT_CMP(jump1, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1705 EMIT_CMP(jump2, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1707 no_states = compiler_common->no_states;
1708 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1709 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1710 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1711 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1713 EMIT_LABEL(label);
1714 sljit_set_label(jump1, label);
1715 sljit_set_label(jump2, label);
1716 return REGEX_NO_ERROR;
1719 #undef CHECK
1721 #define CHECK(exp) \
1722 if (SLJIT_UNLIKELY(exp)) \
1723 return 0
1725 static SLJIT_INLINE void range_set_label(struct sljit_jump **range_jump_list, struct sljit_label *label)
1727 while (*range_jump_list) {
1728 sljit_set_label(*range_jump_list, label);
1729 range_jump_list++;
1733 static sljit_sw compile_range_check(struct compiler_common *compiler_common, sljit_sw ind)
1735 struct sljit_compiler *compiler = compiler_common->compiler;
1736 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1737 struct sljit_jump **range_jump_list = compiler_common->range_jump_list;
1738 int invert = dfa_transitions[ind].value;
1739 struct sljit_label *label;
1740 sljit_sw no_states;
1741 sljit_sw offset;
1742 int init_range = 1, prev_value = 0;
1744 ind++;
1746 while (dfa_transitions[ind].type != type_rng_end) {
1747 if (dfa_transitions[ind].type == type_rng_char) {
1748 EMIT_CMP(*range_jump_list, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1749 range_jump_list++;
1751 else {
1752 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1753 if (init_range) {
1754 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1755 init_range = 0;
1757 if (dfa_transitions[ind].value != prev_value) {
1758 /* Best compatibility to all archs. */
1759 prev_value -= dfa_transitions[ind].value;
1760 if (prev_value < 0) {
1761 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1763 else {
1764 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1766 prev_value = dfa_transitions[ind].value;
1768 EMIT_CMP(*range_jump_list, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1769 range_jump_list++;
1770 ind++;
1772 ind++;
1775 *range_jump_list = NULL;
1777 if (!invert) {
1778 no_states = compiler_common->no_states;
1779 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1780 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1781 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1782 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1784 EMIT_LABEL(label);
1785 range_set_label(compiler_common->range_jump_list, label);
1786 /* Clears the jump list. */
1787 *compiler_common->range_jump_list = NULL;
1789 return ind;
1792 #undef TERM_OFFSET_OF
1793 #undef EMIT_OP1
1794 #undef EMIT_OP2
1795 #undef EMIT_LABEL
1796 #undef EMIT_JUMP
1797 #undef EMIT_CMP
1798 #undef CHECK
1800 /* --------------------------------------------------------------------- */
1801 /* Main compiler */
1802 /* --------------------------------------------------------------------- */
1804 #define TERM_OFFSET_OF(ind, offs) (((ind) * compiler_common.no_states + (offs)) * sizeof(sljit_sw))
1806 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1807 CHECK(sljit_emit_op1(compiler_common.compiler, type, arg1, arg2, arg3, arg4))
1809 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1810 CHECK(sljit_emit_op2(compiler_common.compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1812 #define EMIT_LABEL(label) \
1813 label = sljit_emit_label(compiler_common.compiler); \
1814 CHECK(!label)
1816 #define EMIT_JUMP(jump, type) \
1817 jump = sljit_emit_jump(compiler_common.compiler, type); \
1818 CHECK(!jump)
1820 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1821 jump = sljit_emit_cmp(compiler_common.compiler, type, arg1, arg2, arg3, arg4); \
1822 CHECK(!jump)
1824 /* A do {} while(0) expression helps to avoid goto statements. */
1825 #define BEGIN_GUARD \
1826 do {
1828 #define END_GUARD \
1829 } while(0);
1831 #define CHECK(exp) \
1832 if (SLJIT_UNLIKELY(exp)) \
1833 break;
1835 struct regex_machine* regex_compile(const regex_char_t *regex_string, int length, int re_flags, int *error)
1837 struct compiler_common compiler_common;
1838 sljit_sw ind;
1839 int error_code, done, suggest_fast_forward;
1840 /* ID of an empty match (-1 if not reachable). */
1841 int empty_match_id;
1843 struct sljit_jump *jump;
1844 struct sljit_jump *best_match_found_jump;
1845 struct sljit_jump *fast_forward_jump = NULL;
1846 struct sljit_jump *length_is_zero_jump;
1847 struct sljit_jump *end_check_jump = NULL;
1848 struct sljit_jump *best_match_check_jump = NULL;
1849 struct sljit_jump *non_greedy_end_jump = NULL;
1850 struct sljit_label *label;
1851 struct sljit_label *end_check_label = NULL;
1852 struct sljit_label *start_label;
1853 struct sljit_label *fast_forward_label;
1854 struct sljit_label *fast_forward_return_label;
1856 if (error)
1857 *error = REGEX_NO_ERROR;
1858 #ifdef REGEX_MATCH_VERBOSE
1859 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE | REGEX_MATCH_VERBOSE);
1860 #else
1861 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE);
1862 #endif
1864 /* Step 1: parsing (Left->Right).
1865 Syntax check and AST generator. */
1866 error_code = parse(regex_string, length, &compiler_common);
1867 if (error_code) {
1868 stack_destroy(&compiler_common.stack);
1869 if (error)
1870 *error = error_code;
1871 return NULL;
1874 /* Step 2: generating branches (Right->Left). */
1875 error_code = generate_transitions(&compiler_common);
1876 stack_destroy(&compiler_common.stack);
1877 stack_destroy(&compiler_common.depth);
1878 if (error_code) {
1879 if (compiler_common.dfa_transitions)
1880 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1881 if (error)
1882 *error = error_code;
1883 return NULL;
1886 /* Step 3: Generate necessary data for depth-first search (Left->Right). */
1887 error_code = generate_search_states(&compiler_common);
1888 if (error_code) {
1889 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1890 if (error)
1891 *error = error_code;
1892 return NULL;
1895 #ifdef REGEX_MATCH_VERBOSE
1896 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1897 verbose_transitions(&compiler_common);
1898 #endif
1900 /* Step 4: Left->Right generate code. */
1901 stack_init(&compiler_common.stack);
1902 stack_init(&compiler_common.depth);
1903 done = 0;
1904 compiler_common.machine = NULL;
1905 compiler_common.compiler = NULL;
1906 compiler_common.range_jump_list = NULL;
1908 BEGIN_GUARD
1910 compiler_common.machine = (struct regex_machine*)SLJIT_MALLOC(sizeof(struct regex_machine) + (compiler_common.terms_size - 1) * sizeof(sljit_uw), NULL);
1911 CHECK(!compiler_common.machine);
1913 compiler_common.compiler = sljit_create_compiler(NULL, NULL);
1914 CHECK(!compiler_common.compiler);
1916 if (compiler_common.longest_range_size > 0) {
1917 compiler_common.range_jump_list = (struct sljit_jump**)SLJIT_MALLOC(sizeof(struct sljit_jump*) * compiler_common.longest_range_size, NULL);
1918 CHECK(!compiler_common.range_jump_list);
1921 if ((compiler_common.flags & REGEX_ID_CHECK) && !(compiler_common.flags & REGEX_MATCH_BEGIN))
1922 compiler_common.no_states = 4;
1923 else if (!(compiler_common.flags & REGEX_ID_CHECK) && (compiler_common.flags & REGEX_MATCH_BEGIN))
1924 compiler_common.no_states = 2;
1925 else
1926 compiler_common.no_states = 3;
1928 compiler_common.machine->flags = compiler_common.flags;
1929 compiler_common.machine->no_states = compiler_common.no_states;
1930 compiler_common.machine->size = compiler_common.machine->no_states * compiler_common.terms_size;
1932 /* Study the regular expression. */
1933 empty_match_id = -1;
1934 suggest_fast_forward = 1;
1935 if (!(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN)) {
1936 CHECK(trace_transitions(0, &compiler_common));
1937 while (compiler_common.stack.count > 0) {
1938 ind = stack_pop(&compiler_common.stack)->value;
1939 if (compiler_common.search_states[ind].type == 0) {
1940 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1941 suggest_fast_forward = 0;
1942 empty_match_id = compiler_common.search_states[ind].value;
1944 else if (compiler_common.search_states[ind].type > 0) {
1945 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type != type_end);
1946 if (compiler_common.dfa_transitions[ind].type == type_rng_start && compiler_common.dfa_transitions[ind].value)
1947 suggest_fast_forward = 0;
1949 compiler_common.search_states[ind].value = -1;
1952 else {
1953 SLJIT_ASSERT(compiler_common.dfa_transitions[1].type == type_newline);
1954 CHECK(trace_transitions(1, &compiler_common));
1955 while (compiler_common.stack.count > 0) {
1956 ind = stack_pop(&compiler_common.stack)->value;
1957 if (compiler_common.search_states[ind].type == 0) {
1958 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1959 suggest_fast_forward = 0;
1960 empty_match_id = compiler_common.search_states[ind].value;
1962 compiler_common.search_states[ind].value = -1;
1966 /* Step 4.1: Generate entry. */
1967 CHECK(sljit_emit_enter(compiler_common.compiler, 0, SLJIT_ARGS3(VOID, P, P, 32), 5, 5, 0, 0, 0));
1969 /* Copy arguments to their place. */
1970 EMIT_OP1(SLJIT_MOV, R_REGEX_MATCH, 0, SLJIT_S0, 0);
1971 EMIT_OP1(SLJIT_MOV, R_STRING, 0, SLJIT_S1, 0);
1972 EMIT_OP2(SLJIT_ADD, R_LENGTH, 0, SLJIT_S2, 0, SLJIT_IMM, 1);
1974 /* Init global registers. */
1975 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
1976 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
1977 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
1978 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin));
1979 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index));
1981 /* Check whether the best match has already found in a previous frame. */
1982 EMIT_CMP(jump, SLJIT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 0);
1983 EMIT_JUMP(best_match_found_jump, SLJIT_JUMP);
1985 #ifdef REGEX_MATCH_VERBOSE
1986 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1987 printf("\n-----------------\nTrace\n-----------------\n");
1988 #endif
1990 /* Step 4.2: Generate code for state 0. */
1991 EMIT_LABEL(label);
1992 sljit_emit_op0(compiler_common.compiler, SLJIT_ENDBR);
1993 compiler_common.machine->entry_addrs[0] = (sljit_uw)label;
1995 /* Swapping current and next. */
1996 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_STATE, 0);
1997 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_NEXT_STATE, 0);
1998 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_TEMP, 0);
2000 /* Checking whether the best case needs to be updated. */
2001 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2002 EMIT_CMP(end_check_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2003 EMIT_LABEL(end_check_label);
2005 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2006 EMIT_OP2(SLJIT_ADD, R_CURR_INDEX, 0, R_CURR_INDEX, 0, SLJIT_IMM, 1);
2008 /* Checking whether best case has already found. */
2009 if (!(compiler_common.flags & REGEX_MATCH_END) || (compiler_common.flags & REGEX_MATCH_BEGIN)) {
2010 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2011 /* We can bail out if no more active states remain and R_BEST_BEGIN != -1. */
2012 EMIT_CMP(best_match_check_jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2014 else {
2015 /* We can bail out if no more active states remain (regardless of R_BEST_BEGIN). */
2016 EMIT_CMP(best_match_check_jump, SLJIT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2020 EMIT_LABEL(start_label);
2021 sljit_set_label(jump, start_label);
2023 if (!(compiler_common.flags & REGEX_MATCH_BEGIN) && suggest_fast_forward) {
2024 EMIT_CMP(fast_forward_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2027 /* Loading the next character. */
2028 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_LENGTH, 0, R_LENGTH, 0, SLJIT_IMM, 1);
2029 EMIT_JUMP(length_is_zero_jump, SLJIT_EQUAL);
2031 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_STRING, 0);
2032 #ifdef REGEX_USE_8BIT_CHARS
2033 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2034 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 1);
2035 #else
2036 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2037 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 2);
2038 #endif
2039 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_TEMP, 0);
2041 #ifdef REGEX_MATCH_VERBOSE
2042 if (compiler_common.flags & REGEX_MATCH_VERBOSE) {
2043 printf("(%3d): ", 0);
2044 CHECK(trace_transitions(0, &compiler_common));
2045 while (compiler_common.stack.count > 0) {
2046 ind = stack_pop(&compiler_common.stack)->value;
2047 if (compiler_common.search_states[ind].type >= 0)
2048 printf("-> (%3d:%3d) ", compiler_common.search_states[ind].type, compiler_common.search_states[ind].value);
2049 compiler_common.search_states[ind].value = -1;
2051 printf("\n");
2053 #endif
2055 EMIT_LABEL(fast_forward_return_label);
2056 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2057 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 1);
2058 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2059 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2062 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_INDEX, 0);
2063 CHECK(compile_uncond_tran(&compiler_common, R_NEXT_STATE));
2064 /* And branching to the first state. */
2065 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2067 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2068 EMIT_LABEL(label);
2069 sljit_set_label(jump, label);
2072 /* This is the case where we only have to reset the R_NEXT_HEAD. */
2073 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
2074 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2075 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2077 /* Fast-forward loop. */
2078 if (fast_forward_jump) {
2079 /* Quit from fast-forward loop. */
2080 EMIT_LABEL(fast_forward_label);
2081 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2082 EMIT_OP1(SLJIT_MOV, R_LENGTH, 0, R_NEXT_STATE, 0);
2083 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_CURR_STATE, 0);
2084 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, R_NEXT_HEAD, 0);
2085 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
2086 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
2087 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
2089 /* Update the start field of the locations. */
2090 CHECK(trace_transitions(0, &compiler_common));
2091 while (compiler_common.stack.count > 0) {
2092 ind = stack_pop(&compiler_common.stack)->value;
2093 if (compiler_common.search_states[ind].type >= 0) {
2094 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 2), R_TEMP, 0);
2096 compiler_common.search_states[ind].value = -1;
2098 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2099 EMIT_JUMP(jump, SLJIT_JUMP);
2100 sljit_set_label(jump, fast_forward_return_label);
2102 /* Start fast-forward. */
2103 EMIT_LABEL(label);
2104 sljit_set_label(fast_forward_jump, label);
2106 /* Moving everything to registers. */
2107 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2108 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2109 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2110 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_LENGTH, 0);
2111 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_STRING, 0);
2112 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, R_CURR_INDEX, 0);
2114 /* Fast forward mainloop. */
2115 EMIT_LABEL(label);
2116 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_NEXT_STATE, 0, R_NEXT_STATE, 0, SLJIT_IMM, 1);
2117 EMIT_JUMP(fast_forward_jump, SLJIT_EQUAL);
2119 #ifdef REGEX_USE_8BIT_CHARS
2120 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2121 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 1);
2122 #else
2123 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2124 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 2);
2125 #endif
2127 CHECK(trace_transitions(0, &compiler_common));
2128 CHECK(compile_leave_fast_forward(&compiler_common, fast_forward_label));
2130 EMIT_OP2(SLJIT_ADD, R_NEXT_HEAD, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2131 EMIT_JUMP(jump, SLJIT_JUMP);
2132 sljit_set_label(jump, label);
2134 /* String is finished. */
2135 EMIT_LABEL(label);
2136 sljit_set_label(fast_forward_jump, label);
2137 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_NEXT_HEAD, 0);
2138 EMIT_JUMP(fast_forward_jump, SLJIT_JUMP);
2141 /* End check. */
2142 if (end_check_jump) {
2143 EMIT_LABEL(label);
2144 sljit_set_label(end_check_jump, label);
2146 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || !(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2147 CHECK(compile_end_check(&compiler_common, end_check_label));
2149 else {
2150 /* Since we leave, we do not need to update the R_BEST_BEGIN. */
2151 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2152 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
2153 if (compiler_common.flags & REGEX_ID_CHECK) {
2154 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
2156 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2157 EMIT_JUMP(non_greedy_end_jump, SLJIT_JUMP);
2161 /* Finish check. */
2162 if (best_match_check_jump) {
2163 EMIT_LABEL(label);
2164 sljit_set_label(best_match_check_jump, label);
2166 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2167 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2168 sljit_set_label(jump, start_label);
2170 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2173 /* Leaving matching and storing the necessary values. */
2174 EMIT_LABEL(label);
2175 sljit_set_label(length_is_zero_jump, label);
2176 if (non_greedy_end_jump)
2177 sljit_set_label(non_greedy_end_jump, label);
2179 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_CURR_INDEX, 0);
2180 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2181 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2182 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2184 /* Exit from JIT. */
2185 EMIT_LABEL(label);
2186 sljit_set_label(best_match_found_jump, label);
2187 if (fast_forward_jump)
2188 sljit_set_label(fast_forward_jump, label);
2189 CHECK(sljit_emit_return_void(compiler_common.compiler));
2191 for (ind = 1; ind < compiler_common.dfa_size - 1; ind++) {
2192 if (compiler_common.search_states[ind].type >= 0) {
2193 SLJIT_ASSERT(compiler_common.search_states[ind].type < compiler_common.terms_size);
2194 EMIT_LABEL(label);
2195 sljit_emit_op0(compiler_common.compiler, SLJIT_ENDBR);
2196 compiler_common.machine->entry_addrs[compiler_common.search_states[ind].type] = (sljit_uw)label;
2198 if (compiler_common.dfa_transitions[ind].type == type_char) {
2199 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, compiler_common.dfa_transitions[ind].value);
2201 else if (compiler_common.dfa_transitions[ind].type == type_rng_start) {
2202 ind = compile_range_check(&compiler_common, ind);
2203 CHECK(!ind);
2205 else {
2206 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2207 CHECK(compile_newline_check(&compiler_common, ind));
2210 CHECK(trace_transitions(ind, &compiler_common));
2211 #ifdef REGEX_MATCH_VERBOSE
2212 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2213 printf("(%3d): ", compiler_common.search_states[ind].type);
2214 #endif
2215 CHECK(compile_cond_tran(&compiler_common, compiler_common.search_states[ind].type));
2217 if (compiler_common.dfa_transitions[ind].type == type_char) {
2218 EMIT_LABEL(label);
2219 sljit_set_label(jump, label);
2221 else if (compiler_common.dfa_transitions[ind].type == type_rng_end) {
2222 EMIT_LABEL(label);
2223 range_set_label(compiler_common.range_jump_list, label);
2225 else {
2226 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2229 /* Branch to the next item in the list. */
2230 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1));
2231 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1), SLJIT_IMM, -1);
2232 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2236 if (ind == compiler_common.dfa_size - 1) {
2237 /* Generate an init stub function. */
2238 EMIT_LABEL(label);
2239 CHECK(sljit_emit_enter(compiler_common.compiler, 0, SLJIT_ARGS2(W, P, P), 3, 3, 0, 0, 0));
2241 if (empty_match_id == -1) {
2242 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, -1);
2243 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, 0);
2245 else {
2246 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2247 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, empty_match_id);
2250 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, index), SLJIT_IMM, !(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN) ? 1 : 2);
2252 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || empty_match_id == -1) {
2253 /* The else is a really rare event, so we still generate an empty function instead of a runtime pointer check. */
2254 SLJIT_ASSERT(R_CURR_STATE == SLJIT_S0);
2255 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2256 /* R_CURR_INDEX (put to R_TEMP) is zero. */
2257 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, 0);
2259 CHECK(compile_uncond_tran(&compiler_common, R_CURR_STATE));
2261 else {
2262 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2264 CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_MOV, R_NEXT_HEAD, 0));
2266 compiler_common.machine->continue_match = sljit_generate_code(compiler_common.compiler);
2267 #ifndef SLJIT_INDIRECT_CALL
2268 compiler_common.machine->u.init_match = (void*)(sljit_sw)sljit_get_label_addr(label);
2269 #else
2270 sljit_set_function_context(&compiler_common.machine->u.init_match, &compiler_common.machine->context, sljit_get_label_addr(label), regex_compile);
2271 #endif
2272 #ifdef REGEX_MATCH_VERBOSE
2273 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2274 printf("Continue match: %p Init match: %p\n\n", compiler_common.machine->continue_match, compiler_common.machine->u.init_match);
2275 #endif
2276 if (compiler_common.machine->continue_match) {
2277 for (ind = 0; ind < compiler_common.terms_size; ++ind)
2278 compiler_common.machine->entry_addrs[ind] = sljit_get_label_addr((struct sljit_label*)compiler_common.machine->entry_addrs[ind]);
2279 done = 1;
2282 END_GUARD
2284 stack_destroy(&compiler_common.stack);
2285 stack_destroy(&compiler_common.depth);
2286 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
2287 SLJIT_FREE(compiler_common.search_states, NULL);
2288 if (compiler_common.range_jump_list)
2289 SLJIT_FREE(compiler_common.range_jump_list, NULL);
2290 if (compiler_common.compiler)
2291 sljit_free_compiler(compiler_common.compiler);
2292 if (done)
2293 return compiler_common.machine;
2295 if (compiler_common.machine) {
2296 SLJIT_FREE(compiler_common.machine, NULL);
2298 if (error)
2299 *error = REGEX_MEMORY_ERROR;
2300 return NULL;
2303 #undef TERM_OFFSET_OF
2304 #undef EMIT_OP1
2305 #undef EMIT_OP2
2306 #undef EMIT_LABEL
2307 #undef EMIT_JUMP
2308 #undef EMIT_CMP
2309 #undef BEGIN_GUARD
2310 #undef END_GUARD
2311 #undef CHECK
2313 void regex_free_machine(struct regex_machine *machine)
2315 sljit_free_code(machine->continue_match, NULL);
2316 SLJIT_FREE(machine, NULL);
2319 const char* regex_get_platform_name(void)
2321 return sljit_get_platform_name();
2324 /* --------------------------------------------------------------------- */
2325 /* Mathching utilities */
2326 /* --------------------------------------------------------------------- */
2328 struct regex_match* regex_begin_match(struct regex_machine *machine)
2330 sljit_sw *ptr1;
2331 sljit_sw *ptr2;
2332 sljit_sw *end;
2333 sljit_sw *entry_addrs;
2335 struct regex_match *match = (struct regex_match*)SLJIT_MALLOC(sizeof(struct regex_match) + (machine->size * 2 - 1) * sizeof(sljit_sw), NULL);
2336 if (!match)
2337 return NULL;
2339 ptr1 = match->states;
2340 ptr2 = match->states + machine->size;
2341 end = ptr2;
2342 entry_addrs = (sljit_sw*)machine->entry_addrs;
2344 match->current = ptr1;
2345 match->next = ptr2;
2346 match->head = 0;
2347 match->machine = machine;
2349 /* Init machine states. */
2350 switch (machine->no_states) {
2351 case 2:
2352 while (ptr1 < end) {
2353 *ptr1++ = *entry_addrs;
2354 *ptr2++ = *entry_addrs++;
2355 *ptr1++ = -1;
2356 *ptr2++ = -1;
2358 break;
2360 case 3:
2361 while (ptr1 < end) {
2362 *ptr1++ = *entry_addrs;
2363 *ptr2++ = *entry_addrs++;
2364 *ptr1++ = -1;
2365 *ptr2++ = -1;
2366 *ptr1++ = 0;
2367 *ptr2++ = 0;
2369 break;
2371 case 4:
2372 while (ptr1 < end) {
2373 *ptr1++ = *entry_addrs;
2374 *ptr2++ = *entry_addrs++;
2375 *ptr1++ = -1;
2376 *ptr2++ = -1;
2377 *ptr1++ = 0;
2378 *ptr2++ = 0;
2379 *ptr1++ = 0;
2380 *ptr2++ = 0;
2382 break;
2384 default:
2385 SLJIT_UNREACHABLE();
2386 break;
2389 SLJIT_ASSERT(ptr1 == end);
2391 match->u.continue_match = machine->continue_match;
2393 regex_reset_match(match);
2394 return match;
2397 void regex_reset_match(struct regex_match *match)
2399 struct regex_machine *machine = match->machine;
2400 sljit_sw current, ind;
2401 sljit_sw *current_ptr;
2403 match->best_end = 0;
2404 match->fast_quit = 0;
2405 match->fast_forward = 0;
2407 if (match->head != 0) {
2408 /* Clear the current state. */
2409 current = match->head;
2410 current_ptr = match->current;
2411 do {
2412 ind = (current / sizeof(sljit_sw)) + 1;
2413 current = current_ptr[ind];
2414 current_ptr[ind] = -1;
2415 } while (current != 0);
2417 match->head = machine->u.call_init(match->current, match);
2420 void regex_free_match(struct regex_match *match)
2422 SLJIT_FREE(match, NULL);
2425 void regex_continue_match(struct regex_match *match, const regex_char_t *input_string, int length)
2427 match->u.call_continue(match, input_string, length);
2430 int regex_get_result(struct regex_match *match, int *end, int *id)
2432 int flags = match->machine->flags;
2433 sljit_sw no_states;
2435 *end = match->best_end;
2436 *id = match->best_id;
2437 if (!(flags & (REGEX_MATCH_END | REGEX_FAKE_MATCH_END)))
2438 return match->best_begin;
2440 if (flags & REGEX_FAKE_MATCH_END) {
2441 SLJIT_ASSERT(!(flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END)));
2442 if (match->best_begin != -1)
2443 return match->best_begin;
2445 no_states = match->machine->no_states;
2446 if (match->current[no_states + 1] == -1)
2447 return -1;
2448 if (flags & REGEX_ID_CHECK)
2449 *id = match->current[no_states + 3];
2450 if (!(flags & REGEX_FAKE_MATCH_BEGIN))
2451 *end = match->index - 1;
2452 else
2453 *end = match->index - 2;
2454 return match->current[no_states + 2];
2456 else {
2457 /* Check the status of the last code. */
2458 if (!(flags & REGEX_MATCH_BEGIN)) {
2459 /* No shortcut in this case. */
2460 if (!(flags & REGEX_ID_CHECK)) {
2461 if (match->current[1] == -1)
2462 return -1;
2463 *end = match->index - 1;
2464 return match->current[2];
2467 if (match->current[1] == -1)
2468 return -1;
2469 *end = match->index - 1;
2470 *id = match->current[3];
2471 return match->current[2];
2474 /* Shortcut is possible in this case. */
2475 if (!(flags & REGEX_ID_CHECK)) {
2476 if (match->current[1] == -1 || match->head == -1)
2477 return -1;
2478 *end = match->index - 1;
2479 return 0;
2482 if (match->current[1] == -1 || match->head == -1)
2483 return -1;
2484 *end = match->index - 1;
2485 *id = match->current[2];
2486 return 0;
2490 int regex_is_match_finished(struct regex_match *match)
2492 return match->fast_quit;
2495 #ifdef REGEX_MATCH_VERBOSE
2496 void regex_continue_match_debug(struct regex_match *match, const regex_char_t *input_string, int length)
2498 sljit_sw *ptr;
2499 sljit_sw *end;
2500 sljit_sw count;
2501 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2502 sljit_sw current;
2503 #endif
2504 sljit_sw no_states = match->machine->no_states;
2505 sljit_sw len = match->machine->size;
2507 while (length > 0) {
2508 match->u.call_continue(match, input_string, 1);
2510 if (match->fast_forward) {
2511 if (match->machine->flags & REGEX_MATCH_VERBOSE)
2512 printf("fast forward\n");
2515 /* Verbose (first). */
2516 if (match->machine->flags & REGEX_MATCH_VERBOSE) {
2517 ptr = match->current;
2518 end = ptr + len;
2519 count = 0;
2520 printf("'%c' (%3ld->%3ld [%3ld]) ", *input_string, (long)match->best_begin, (long)match->best_end, (long)match->best_id);
2521 while (ptr < end) {
2522 printf("[%3ld:", (long)count++);
2523 switch (no_states) {
2524 case 2:
2525 if (ptr[1] != -1)
2526 printf("+] ");
2527 else
2528 printf(" ] ");
2529 break;
2531 case 3:
2532 if (ptr[1] != -1)
2533 printf("+,%3ld] ", (long)ptr[2]);
2534 else
2535 printf(" ,XXX] ");
2536 break;
2538 case 4:
2539 if (ptr[1] != -1)
2540 printf("+,%3ld,%3ld] ", (long)ptr[2], (long)ptr[3]);
2541 else
2542 printf(" ,XXX,XXX] ");
2543 break;
2545 ptr += no_states;
2547 printf("\n");
2550 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2551 /* Sanity check (later). */
2552 ptr = match->next;
2553 end = ptr + len;
2554 while (ptr < end) {
2555 SLJIT_ASSERT(ptr[1] == -1);
2556 ptr += no_states;
2559 /* Check number of active elements. */
2560 ptr = match->current + no_states;
2561 end = ptr + len - no_states;
2562 count = 0;
2563 while (ptr < end) {
2564 if (ptr[1] != -1)
2565 count++;
2566 ptr += no_states;
2569 /* Check chain list. */
2570 current = match->head;
2571 ptr = match->current;
2572 while (current != 0) {
2573 SLJIT_ASSERT(current >= 0 && current < len * sizeof(sljit_sw));
2574 SLJIT_ASSERT((current % (no_states * sizeof(sljit_sw))) == 0);
2575 SLJIT_ASSERT(count > 0);
2576 current = ptr[(current / sizeof(sljit_sw)) + 1];
2577 count--;
2579 SLJIT_ASSERT(count == 0);
2580 #endif
2582 if (match->fast_quit) {
2583 /* the machine has stopped working. */
2584 if (match->machine->flags & REGEX_MATCH_VERBOSE)
2585 printf("Best match has found\n");
2586 break;
2589 input_string++;
2590 length--;
2593 #endif