Improve floating point compare.
[sljit.git] / regex_src / regexJIT.c
blob2eaecd96e146e026f64b8a2bc82351355dbe6164
1 /*
2 * Stack-less Just-In-Time compiler
4 * Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
6 * Redistribution and use in source and binary forms, with or without modification, are
7 * permitted provided that the following conditions are met:
9 * 1. Redistributions of source code must retain the above copyright notice, this list of
10 * conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice, this list
13 * of conditions and the following disclaimer in the documentation and/or other materials
14 * provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "sljitLir.h"
28 #include "regexJIT.h"
30 #include <stdlib.h>
32 #ifdef REGEX_MATCH_VERBOSE
33 #include <stdio.h>
34 #endif
36 /* Extra, hidden flags:
37 {id!} where id > 0 found in the code. */
38 #define REGEX_ID_CHECK 0x100
39 /* When REGEX_NEWLINE && REGEX_MATCH_BEGIN defined, the pattern turn to a normal search,
40 which starts with [\r\n] character range. */
41 #define REGEX_FAKE_MATCH_BEGIN 0x200
42 /* When REGEX_NEWLINE && REGEX_MATCH_END defined, the pattern turn to a normal search,
43 which ends with [\r\n] character range. */
44 #define REGEX_FAKE_MATCH_END 0x400
46 /* --------------------------------------------------------------------- */
47 /* Structures for JIT-ed pattern matching */
48 /* --------------------------------------------------------------------- */
50 struct regex_machine
52 /* flags. */
53 int flags;
54 /* Number of state descriptors for one term. */
55 sljit_sw no_states;
56 /* Total size. */
57 sljit_sw size;
59 union {
60 void *init_match;
61 sljit_sw (SLJIT_FUNC *call_init)(void *next, void* match);
62 } u;
63 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
64 struct sljit_function_context context;
65 #endif
67 void *continue_match;
69 /* Variable sized array to contain the handler addresses. */
70 sljit_uw entry_addrs[1];
73 struct regex_match
75 /* Current and next state array. */
76 sljit_sw *current;
77 sljit_sw *next;
78 /* Starting. */
79 sljit_sw head;
80 /* String character index (ever increasing). */
81 sljit_sw index;
82 /* Best match found so far (members in priority order). */
83 sljit_sw best_begin;
84 sljit_sw best_end;
85 sljit_sw best_id;
86 /* Bool flags (encoded as word). */
87 sljit_sw fast_quit;
88 sljit_sw fast_forward;
89 /* Machine. */
90 struct regex_machine *machine;
92 union {
93 void *continue_match;
94 void (SLJIT_FUNC *call_continue)(struct regex_match *match, const regex_char_t *input_string, int length);
95 } u;
97 /* Variable sized array to contain the state arrays. */
98 sljit_sw states[1];
101 /* State vector
102 ITEM[0] - pointer to the address inside the machine code
103 ITEM[1] - next pointer
104 ITEM[2] - string started from (optional)
105 ITEM[3] - max ID (optional) */
107 /* Register allocation. */
108 /* Current state array (loaded & stored: regex_match->current). */
109 #define R_CURR_STATE SLJIT_S0
110 /* Next state array (loaded & stored: regex_match->next). */
111 #define R_NEXT_STATE SLJIT_S1
112 /* Head (loaded & stored: regex_match->head). */
113 #define R_NEXT_HEAD SLJIT_S2
114 /* String fragment pointer. */
115 #define R_STRING SLJIT_S3
116 /* String fragment length. */
117 #define R_LENGTH SLJIT_S4
118 /* 'struct regex_match*' */
119 #define R_REGEX_MATCH SLJIT_R0
120 /* Current character. */
121 #define R_CURR_CHAR SLJIT_R1
122 /* Temporary register. */
123 #define R_TEMP SLJIT_R2
124 /* Caches the regex_match->best_begin. */
125 #define R_BEST_BEGIN SLJIT_R3
126 /* Current character index. */
127 #define R_CURR_INDEX SLJIT_R4
129 /* --------------------------------------------------------------------- */
130 /* Stack management */
131 /* --------------------------------------------------------------------- */
133 /* Try to allocate 2^n blocks. */
134 #define STACK_FRAGMENT_SIZE (((64 * sizeof(struct stack_item)) - (sizeof(struct stack_fragment_data))) / (sizeof(struct stack_item)))
136 struct stack_item {
137 int type;
138 int value;
141 struct stack_fragment_data {
142 struct stack_fragment *next;
143 struct stack_fragment *prev;
146 struct stack_fragment {
147 struct stack_fragment_data data;
148 struct stack_item items[STACK_FRAGMENT_SIZE];
151 struct stack {
152 struct stack_fragment *first;
153 struct stack_fragment *last;
154 sljit_uw index;
155 sljit_uw count;
158 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
160 static void stack_check(struct stack *stack)
162 struct stack_fragment *curr;
163 int found;
165 if (!stack)
166 return;
168 SLJIT_ASSERT(stack->index < STACK_FRAGMENT_SIZE);
170 if (stack->first == NULL) {
171 SLJIT_ASSERT(stack->first == NULL && stack->last == NULL);
172 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
173 return;
176 found = 0;
177 if (stack->last == NULL) {
178 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
179 found = 1;
182 SLJIT_ASSERT(stack->first->data.prev == NULL);
183 curr = stack->first;
184 while (curr) {
185 if (curr == stack->last)
186 found = 1;
187 if (curr->data.next)
188 SLJIT_ASSERT(curr->data.next->data.prev == curr);
189 curr = curr->data.next;
191 SLJIT_ASSERT(found);
194 #endif
196 static void stack_init(struct stack *stack)
198 stack->first = NULL;
199 stack->last = NULL;
200 stack->index = STACK_FRAGMENT_SIZE - 1;
201 stack->count = 0;
204 static void stack_destroy(struct stack *stack)
206 struct stack_fragment *curr = stack->first;
207 struct stack_fragment *prev;
209 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
210 stack_check(stack);
211 #endif
213 while (curr) {
214 prev = curr;
215 curr = curr->data.next;
216 SLJIT_FREE(prev, NULL);
220 static SLJIT_INLINE struct stack_item* stack_top(struct stack *stack)
222 SLJIT_ASSERT(stack->last);
223 return stack->last->items + stack->index;
226 static int stack_push(struct stack *stack, int type, int value)
228 if (stack->last) {
229 stack->index++;
230 if (stack->index >= STACK_FRAGMENT_SIZE) {
231 stack->index = 0;
232 if (!stack->last->data.next) {
233 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
234 if (!stack->last->data.next)
235 return 1;
236 stack->last->data.next->data.next = NULL;
237 stack->last->data.next->data.prev = stack->last;
239 stack->last = stack->last->data.next;
242 else if (!stack->first) {
243 stack->last = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
244 if (!stack->last)
245 return 1;
246 stack->last->data.prev = NULL;
247 stack->last->data.next = NULL;
248 stack->first = stack->last;
249 stack->index = 0;
251 else {
252 stack->last = stack->first;
253 stack->index = 0;
255 stack->last->items[stack->index].type = type;
256 stack->last->items[stack->index].value = value;
257 stack->count++;
258 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
259 stack_check(stack);
260 #endif
261 return 0;
264 static struct stack_item* stack_pop(struct stack *stack)
266 struct stack_item *ret = stack_top(stack);
268 if (stack->index > 0)
269 stack->index--;
270 else {
271 stack->last = stack->last->data.prev;
272 stack->index = STACK_FRAGMENT_SIZE - 1;
275 stack->count--;
276 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
277 stack_check(stack);
278 #endif
279 return ret;
282 static SLJIT_INLINE void stack_clone(struct stack *src, struct stack *dst)
284 *dst = *src;
287 static int stack_push_copy(struct stack *stack, int items, int length)
289 struct stack_fragment *frag1;
290 struct stack_fragment *frag2;
291 sljit_uw ind1, ind2;
292 sljit_uw counter;
294 SLJIT_ASSERT(stack->count >= (sljit_uw)length && items <= length && items > 0);
296 /* Allocate the necessary elements. */
297 counter = (sljit_uw)items;
298 frag1 = stack->last;
299 ind1 = stack->index;
300 while (counter > 0) {
301 if (stack->index + counter >= STACK_FRAGMENT_SIZE) {
302 SLJIT_ASSERT(counter >= STACK_FRAGMENT_SIZE - stack->index - 1 + 1);
303 counter -= STACK_FRAGMENT_SIZE - stack->index - 1 + 1;
304 stack->index = 0;
305 if (!stack->last->data.next) {
306 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
307 if (!stack->last->data.next)
308 return 1;
309 stack->last->data.next->data.next = NULL;
310 stack->last->data.next->data.prev = stack->last;
312 stack->last = stack->last->data.next;
314 else {
315 stack->index += counter;
316 counter = 0;
320 frag2 = stack->last;
321 ind2 = stack->index;
322 while (length > 0) {
323 frag2->items[ind2] = frag1->items[ind1];
325 if (ind1 == 0) {
326 ind1 = STACK_FRAGMENT_SIZE;
327 frag1 = frag1->data.prev;
329 if (ind2 == 0) {
330 ind2 = STACK_FRAGMENT_SIZE;
331 frag2 = frag2->data.prev;
334 ind1--;
335 ind2--;
336 length--;
339 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
340 stack_check(stack);
341 #endif
342 stack->count += (sljit_uw)items;
343 return 0;
346 /* --------------------------------------------------------------------- */
347 /* Parser */
348 /* --------------------------------------------------------------------- */
350 enum {
351 /* Common. */
352 type_begin,
353 type_end,
354 type_char,
355 type_newline,
356 type_id,
357 type_rng_start,
358 type_rng_end,
359 type_rng_char,
360 type_rng_left,
361 type_rng_right,
363 /* generator only. */
364 type_branch,
365 type_jump,
367 /* Parser only. */
368 type_open_br,
369 type_close_br,
370 type_select,
371 type_asterisk,
372 type_plus_sign,
373 type_qestion_mark
376 struct compiler_common {
377 /* Temporary stacks. */
378 struct stack stack;
379 struct stack depth;
380 /* REGEX_ flags. */
381 int flags;
382 /* Encoded size of the dfa representation. */
383 sljit_uw dfa_size;
384 /* Number of terms. */
385 sljit_sw terms_size;
386 /* Number of state descriptors for one term (same as machine->no_states). */
387 sljit_sw no_states;
388 /* Number of type_rng_(char|left)-s in the longest character range. */
389 sljit_sw longest_range_size;
391 /* DFA linear representation (size: dfa_size). */
392 struct stack_item *dfa_transitions;
393 /* Term id and search state pairs (size: dfa_size). */
394 struct stack_item *search_states;
396 /* sljit compiler */
397 struct sljit_compiler *compiler;
398 /* Machine data, which must be kept for later use. */
399 struct regex_machine *machine;
400 /* Temporary space for jumps (size: longest_range_size). */
401 struct sljit_jump **range_jump_list;
404 static const regex_char_t* decode_number(const regex_char_t *regex_string, int length, int *result)
406 int value = 0;
408 SLJIT_ASSERT(length > 0);
409 if (*regex_string < '0' || *regex_string > '9') {
410 *result = -1;
411 return regex_string;
414 while (length > 0 && *regex_string >= '0' && *regex_string <= '9') {
415 value = value * 10 + (*regex_string - '0');
416 length--;
417 regex_string++;
420 *result = value;
421 return regex_string;
424 static int iterate(struct stack *stack, int min, int max)
426 struct stack it;
427 struct stack_item *item;
428 int count = -1;
429 int len = 0;
430 int depth = 0;
432 stack_clone(stack, &it);
434 /* Calculate size. */
435 while (count < 0) {
436 item = stack_pop(&it);
437 switch (item->type) {
438 case type_id:
439 case type_rng_end:
440 case type_rng_char:
441 case type_rng_left:
442 case type_rng_right:
443 case type_plus_sign:
444 case type_qestion_mark:
445 len++;
446 break;
448 case type_asterisk:
449 len += 2;
450 break;
452 case type_close_br:
453 depth++;
454 break;
456 case type_open_br:
457 SLJIT_ASSERT(depth > 0);
458 depth--;
459 if (depth == 0)
460 count = (int)it.count;
461 break;
463 case type_select:
464 SLJIT_ASSERT(depth > 0);
465 len += 2;
466 break;
468 default:
469 SLJIT_ASSERT(item->type != type_begin && item->type != type_end);
470 if (depth == 0)
471 count = (int)it.count;
472 len++;
473 break;
477 if (min == 0 && max == 0) {
478 /* {0,0} case, not {0,} case: delete subtree. */
479 stack_clone(&it, stack);
480 /* and put an empty bracket expression instead of it. */
481 if (stack_push(stack, type_open_br, 0))
482 return REGEX_MEMORY_ERROR;
483 if (stack_push(stack, type_close_br, 0))
484 return REGEX_MEMORY_ERROR;
485 return len;
488 count = (int)stack->count - count;
490 /* Put an open bracket before the sequence. */
491 if (stack_push_copy(stack, 1, count))
492 return -1;
494 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
495 SLJIT_ASSERT(stack_push(&it, type_open_br, 0) == 0);
496 #else
497 stack_push(&it, type_open_br, 0);
498 #endif
500 /* Copy the data. */
501 if (max > 0) {
502 len = len * (max - 1);
503 max -= min;
504 /* Insert ? operators. */
505 len += max;
507 if (min > 0) {
508 min--;
509 while (min > 0) {
510 if (stack_push_copy(stack, count, count))
511 return -1;
512 min--;
514 if (max > 0) {
515 if (stack_push_copy(stack, count, count))
516 return -1;
517 if (stack_push(stack, type_qestion_mark, 0))
518 return REGEX_MEMORY_ERROR;
519 count++;
520 max--;
523 else {
524 SLJIT_ASSERT(max > 0);
525 max--;
526 count++;
527 if (stack_push(stack, type_qestion_mark, 0))
528 return REGEX_MEMORY_ERROR;
531 while (max > 0) {
532 if (stack_push_copy(stack, count, count))
533 return -1;
534 max--;
537 else {
538 SLJIT_ASSERT(min > 0);
539 min--;
540 /* Insert + operator. */
541 len = len * min + 1;
542 while (min > 0) {
543 if (stack_push_copy(stack, count, count))
544 return -1;
545 min--;
548 if (stack_push(stack, type_plus_sign, 0))
549 return REGEX_MEMORY_ERROR;
552 /* Close the opened bracket. */
553 if (stack_push(stack, type_close_br, 0))
554 return REGEX_MEMORY_ERROR;
556 return len;
559 static int parse_iterator(const regex_char_t *regex_string, int length, struct stack *stack, sljit_uw *dfa_size, int begin)
561 /* We only know that *regex_string == { . */
562 int val1, val2;
563 const regex_char_t *base_from = regex_string;
564 const regex_char_t *from;
566 length--;
567 regex_string++;
569 /* Decode left value. */
570 val2 = -1;
571 if (length == 0)
572 return -2;
573 if (*regex_string == ',') {
574 val1 = 0;
575 length--;
576 regex_string++;
578 else {
579 from = regex_string;
580 regex_string = decode_number(regex_string, length, &val1);
581 if (val1 < 0)
582 return -2;
583 length -= (int)(regex_string - from);
585 if (length == 0)
586 return -2;
587 if (*regex_string == '}') {
588 val2 = val1;
589 if (val1 == 0)
590 val1 = -1;
592 else if (length >= 2 && *regex_string == '!' && regex_string[1] == '}') {
593 /* Non posix extension. */
594 if (stack_push(stack, type_id, val1))
595 return -1;
596 (*dfa_size)++;
597 return (int)(regex_string - base_from) + 1;
599 else {
600 if (*regex_string != ',')
601 return -2;
602 length--;
603 regex_string++;
607 if (begin)
608 return -2;
610 /* Decode right value. */
611 if (val2 == -1) {
612 if (length == 0)
613 return -2;
614 if (*regex_string == '}')
615 val2 = 0;
616 else {
617 from = regex_string;
618 regex_string = decode_number(regex_string, length, &val2);
619 length -= (int)(regex_string - from);
620 if (val2 < 0 || length == 0 || *regex_string != '}' || val2 < val1)
621 return -2;
622 if (val2 == 0) {
623 SLJIT_ASSERT(val1 == 0);
624 val1 = -1;
629 /* Fast cases. */
630 if (val1 > 1 || val2 > 1) {
631 val1 = iterate(stack, val1, val2);
632 if (val1 < 0)
633 return -1;
634 *dfa_size += (sljit_uw)val1;
636 else if (val1 == 0 && val2 == 0) {
637 if (stack_push(stack, type_asterisk, 0))
638 return -1;
639 *dfa_size += 2;
641 else if (val1 == 1 && val2 == 0) {
642 if (stack_push(stack, type_plus_sign, 0))
643 return -1;
644 (*dfa_size)++;
646 else if (val1 == 0 && val2 == 1) {
647 if (stack_push(stack, type_qestion_mark, 0))
648 return -1;
649 (*dfa_size)++;
651 else if (val1 == -1) {
652 val1 = iterate(stack, 0, 0);
653 if (val1 < 0)
654 return -1;
655 *dfa_size -= (sljit_uw)val1;
656 SLJIT_ASSERT(*dfa_size >= 2);
658 else {
659 /* Ignore. */
660 SLJIT_ASSERT(val1 == 1 && val2 == 1);
662 return (int)(regex_string - base_from);
665 static int parse_char_range(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
667 struct stack* stack = &compiler_common->stack;
668 const regex_char_t *base_from = regex_string;
669 regex_char_t left_char, right_char, tmp_char;
671 length--;
672 regex_string++;
674 if (length == 0)
675 return -2;
677 if (*regex_string != '^') {
678 if (stack_push(stack, type_rng_start, 0))
679 return -1;
681 else {
682 length--;
683 regex_string++;
685 if (length == 0)
686 return -2;
688 if (stack_push(stack, type_rng_start, 1))
689 return -1;
691 /* For both the type_rng_start & type_rng_end. */
692 compiler_common->dfa_size += 2;
694 /* Range must be at least 1 character. */
695 if (*regex_string == ']') {
696 length--;
697 regex_string++;
698 if (stack_push(stack, type_rng_char, ']'))
699 return -1;
700 compiler_common->dfa_size++;
703 while (1) {
704 if (length == 0)
705 return -2;
707 if (*regex_string == ']')
708 break;
710 if (*regex_string != '\\')
711 left_char = *regex_string;
712 else {
713 regex_string++;
714 length--;
715 if (length == 0)
716 return -2;
717 left_char = *regex_string;
719 regex_string++;
720 length--;
722 /* Is a range here? */
723 if (length >= 3 && *regex_string == '-' && *(regex_string + 1) != ']') {
724 regex_string++;
725 length--;
727 if (*regex_string != '\\')
728 right_char = *regex_string;
729 else {
730 regex_string++;
731 length--;
732 if (length == 0)
733 return -2;
734 right_char = *regex_string;
736 regex_string++;
737 length--;
739 if (left_char > right_char) {
740 /* Swap if necessary. */
741 tmp_char = left_char;
742 left_char = right_char;
743 right_char = tmp_char;
746 if (stack_push(stack, type_rng_left, left_char))
747 return -1;
748 if (stack_push(stack, type_rng_right, right_char))
749 return -1;
750 compiler_common->dfa_size += 2;
752 else {
753 if (stack_push(stack, type_rng_char, left_char))
754 return -1;
755 compiler_common->dfa_size++;
759 if (stack_push(stack, type_rng_end, 0))
760 return -1;
761 return (int)(regex_string - base_from);
764 static int parse(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
766 /* Depth of bracketed expressions. */
767 int depth = 0;
768 /* Have we already found a term? '1' if not yet. */
769 int begin = 1;
770 /* Cache stack pointer. */
771 struct stack* stack = &compiler_common->stack;
772 int tmp;
774 /* Type_begin and type_end. */
775 compiler_common->dfa_size = 2;
776 stack_init(stack);
777 if (stack_push(stack, type_begin, 0))
778 return REGEX_MEMORY_ERROR;
780 if (length > 0 && *regex_string == '^') {
781 compiler_common->flags |= REGEX_MATCH_BEGIN;
782 length--;
783 regex_string++;
786 if ((compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) == (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) {
787 /* Replace REGEX_MATCH_BEGIN flag to REGEX_FAKE_MATCH_BEGIN */
788 compiler_common->flags &= ~REGEX_MATCH_BEGIN;
789 compiler_common->flags |= REGEX_FAKE_MATCH_BEGIN;
790 /* and append a new-line search. */
791 if (stack_push(stack, type_newline, 0))
792 return REGEX_MEMORY_ERROR;
793 compiler_common->dfa_size++;
794 /* Begin intentionally kept as 1. */
797 while (length > 0) {
798 switch (*regex_string) {
799 case '\\' :
800 length--;
801 regex_string++;
802 if (length == 0)
803 return REGEX_INVALID_REGEX;
804 if (stack_push(stack, type_char, *regex_string))
805 return REGEX_MEMORY_ERROR;
806 begin = 0;
807 compiler_common->dfa_size++;
808 break;
810 case '.' :
811 if (stack_push(stack, type_rng_start, 1))
812 return REGEX_MEMORY_ERROR;
813 if (compiler_common->flags & REGEX_NEWLINE) {
814 if (stack_push(stack, type_rng_char, '\n'))
815 return REGEX_MEMORY_ERROR;
816 if (stack_push(stack, type_rng_char, '\r'))
817 return REGEX_MEMORY_ERROR;
818 compiler_common->dfa_size += 2;
820 if (stack_push(stack, type_rng_end, 1))
821 return REGEX_MEMORY_ERROR;
822 begin = 0;
823 compiler_common->dfa_size += 2;
824 break;
826 case '(' :
827 depth++;
828 if (stack_push(stack, type_open_br, 0))
829 return REGEX_MEMORY_ERROR;
830 begin = 1;
831 break;
833 case ')' :
834 if (depth == 0)
835 return REGEX_INVALID_REGEX;
836 depth--;
837 if (stack_push(stack, type_close_br, 0))
838 return REGEX_MEMORY_ERROR;
839 begin = 0;
840 break;
842 case '|' :
843 if (stack_push(stack, type_select, 0))
844 return REGEX_MEMORY_ERROR;
845 begin = 1;
846 compiler_common->dfa_size += 2;
847 break;
849 case '*' :
850 if (begin)
851 return REGEX_INVALID_REGEX;
852 if (stack_push(stack, type_asterisk, 0))
853 return REGEX_MEMORY_ERROR;
854 compiler_common->dfa_size += 2;
855 break;
857 case '?' :
858 case '+' :
859 if (begin)
860 return REGEX_INVALID_REGEX;
861 if (stack_push(stack, (*regex_string == '+') ? type_plus_sign : type_qestion_mark, 0))
862 return REGEX_MEMORY_ERROR;
863 compiler_common->dfa_size++;
864 break;
866 case '{' :
867 tmp = parse_iterator(regex_string, length, stack, &compiler_common->dfa_size, begin);
869 if (tmp >= 0) {
870 length -= tmp;
871 regex_string += tmp;
873 else if (tmp == -1)
874 return REGEX_MEMORY_ERROR;
875 else {
876 /* Not a valid range expression. */
877 SLJIT_ASSERT(tmp == -2);
878 if (stack_push(stack, type_char, '{'))
879 return REGEX_MEMORY_ERROR;
880 compiler_common->dfa_size++;
882 break;
884 case '[' :
885 tmp = parse_char_range(regex_string, length, compiler_common);
886 if (tmp >= 0) {
887 length -= tmp;
888 regex_string += tmp;
890 else if (tmp == -1)
891 return REGEX_MEMORY_ERROR;
892 else {
893 SLJIT_ASSERT(tmp == -2);
894 return REGEX_INVALID_REGEX;
896 begin = 0;
897 break;
899 default:
900 if (length == 1 && *regex_string == '$') {
901 compiler_common->flags |= REGEX_MATCH_END;
902 break;
904 if (stack_push(stack, type_char, *regex_string))
905 return REGEX_MEMORY_ERROR;
906 begin = 0;
907 compiler_common->dfa_size++;
908 break;
910 length--;
911 regex_string++;
914 if (depth != 0)
915 return REGEX_INVALID_REGEX;
917 if ((compiler_common->flags & (REGEX_MATCH_END | REGEX_NEWLINE)) == (REGEX_MATCH_END | REGEX_NEWLINE)) {
918 /* Replace REGEX_MATCH_END flag to REGEX_FAKE_MATCH_END */
919 compiler_common->flags &= ~REGEX_MATCH_END;
920 compiler_common->flags |= REGEX_FAKE_MATCH_END;
921 /* and append a new-line search. */
922 if (stack_push(stack, type_newline, 1))
923 return REGEX_MEMORY_ERROR;
924 compiler_common->dfa_size++;
925 /* Begin intentionally kept as 1. */
928 if (stack_push(stack, type_end, 0))
929 return REGEX_MEMORY_ERROR;
931 return REGEX_NO_ERROR;
934 /* --------------------------------------------------------------------- */
935 /* Generating machine state transitions */
936 /* --------------------------------------------------------------------- */
938 #define PUT_TRANSITION(typ, val) \
939 do { \
940 --transitions_ptr; \
941 transitions_ptr->type = typ; \
942 transitions_ptr->value = val; \
943 } while (0)
945 static struct stack_item* handle_iteratives(struct stack_item *transitions_ptr, struct stack_item *transitions, struct stack *depth)
947 struct stack_item *item;
949 while (1) {
950 item = stack_top(depth);
952 switch (item->type) {
953 case type_asterisk:
954 SLJIT_ASSERT(transitions[item->value].type == type_branch);
955 transitions[item->value].value = (int)(transitions_ptr - transitions);
956 PUT_TRANSITION(type_branch, item->value + 1);
957 break;
959 case type_plus_sign:
960 SLJIT_ASSERT(transitions[item->value].type == type_branch);
961 transitions[item->value].value = (int)(transitions_ptr - transitions);
962 break;
964 case type_qestion_mark:
965 PUT_TRANSITION(type_branch, item->value);
966 break;
968 default:
969 return transitions_ptr;
971 stack_pop(depth);
975 static int generate_transitions(struct compiler_common *compiler_common)
977 struct stack *stack = &compiler_common->stack;
978 struct stack *depth = &compiler_common->depth;
979 struct stack_item *transitions_ptr;
980 struct stack_item *item;
982 stack_init(depth);
983 compiler_common->dfa_transitions = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
984 if (!compiler_common->dfa_transitions)
985 return REGEX_MEMORY_ERROR;
987 /* Go through the items of the stack and generate the necessary branches and jumps (edges of DFA). */
988 transitions_ptr = compiler_common->dfa_transitions + compiler_common->dfa_size;
989 while (stack->count > 0) {
990 item = stack_pop(stack);
991 switch (item->type) {
992 case type_begin:
993 case type_open_br:
994 item = stack_pop(depth);
995 if (item->type == type_select)
996 PUT_TRANSITION(type_branch, item->value + 1);
997 else
998 SLJIT_ASSERT(item->type == type_close_br);
999 if (stack->count == 0)
1000 PUT_TRANSITION(type_begin, 0);
1001 else
1002 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1003 break;
1005 case type_end:
1006 case type_close_br:
1007 if (item->type == type_end)
1008 *--transitions_ptr = *item;
1009 if (stack_push(depth, type_close_br, (int)(transitions_ptr - compiler_common->dfa_transitions)))
1010 return REGEX_MEMORY_ERROR;
1011 break;
1013 case type_select:
1014 item = stack_top(depth);
1015 if (item->type == type_select) {
1016 SLJIT_ASSERT(compiler_common->dfa_transitions[item->value].type == type_jump);
1017 PUT_TRANSITION(type_branch, item->value + 1);
1018 PUT_TRANSITION(type_jump, item->value);
1019 item->value = (int)(transitions_ptr - compiler_common->dfa_transitions);
1021 else {
1022 SLJIT_ASSERT(item->type == type_close_br);
1023 item->type = type_select;
1024 PUT_TRANSITION(type_jump, item->value);
1025 item->value = (int)(transitions_ptr - compiler_common->dfa_transitions);
1027 break;
1029 case type_asterisk:
1030 case type_plus_sign:
1031 case type_qestion_mark:
1032 if (item->type != type_qestion_mark)
1033 PUT_TRANSITION(type_branch, 0);
1034 if (stack_push(depth, item->type, (int)(transitions_ptr - compiler_common->dfa_transitions)))
1035 return REGEX_MEMORY_ERROR;
1036 break;
1038 case type_char:
1039 case type_newline:
1040 case type_rng_start:
1041 /* Requires handle_iteratives. */
1042 *--transitions_ptr = *item;
1043 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1044 break;
1046 default:
1047 *--transitions_ptr = *item;
1048 break;
1052 SLJIT_ASSERT(compiler_common->dfa_transitions == transitions_ptr);
1053 SLJIT_ASSERT(depth->count == 0);
1054 return REGEX_NO_ERROR;
1057 #undef PUT_TRANSITION
1059 #ifdef REGEX_MATCH_VERBOSE
1061 static void verbose_transitions(struct compiler_common *compiler_common)
1063 struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1064 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1065 struct stack_item *search_states_ptr = compiler_common->search_states;
1066 int pos;
1068 printf("-----------------\nTransitions\n-----------------\n");
1069 pos = 0;
1070 while (transitions_ptr < transitions_end) {
1071 printf("[%3d] ", pos++);
1072 if (search_states_ptr->type >= 0)
1073 printf("(%3d) ", search_states_ptr->type);
1074 switch (transitions_ptr->type) {
1075 case type_begin:
1076 printf("type_begin\n");
1077 break;
1079 case type_end:
1080 printf("type_end\n");
1081 break;
1083 case type_char:
1084 if (transitions_ptr->value >= ' ')
1085 printf("type_char '%c'\n", transitions_ptr->value);
1086 else
1087 printf("type_char 0x%x\n", transitions_ptr->value);
1088 break;
1090 case type_newline:
1091 printf("type_newline %s\n", transitions_ptr->value ? "(end)" : "(begin)");
1092 break;
1094 case type_id:
1095 printf("type_id %d\n", transitions_ptr->value);
1096 break;
1098 case type_rng_start:
1099 printf("type_rng_start %s\n", transitions_ptr->value ? "(invert)" : "(normal)");
1100 break;
1102 case type_rng_end:
1103 printf("type_rng_end\n");
1104 break;
1106 case type_rng_char:
1107 if (transitions_ptr->value >= ' ')
1108 printf("type_rng_char '%c'\n", transitions_ptr->value);
1109 else
1110 printf("type_rng_char 0x%x\n", transitions_ptr->value);
1111 break;
1113 case type_rng_left:
1114 if (transitions_ptr->value >= ' ')
1115 printf("type_rng_left '%c'\n", transitions_ptr->value);
1116 else
1117 printf("type_rng_left 0x%x\n", transitions_ptr->value);
1118 break;
1120 case type_rng_right:
1121 if (transitions_ptr->value >= ' ')
1122 printf("type_rng_right '%c'\n", transitions_ptr->value);
1123 else
1124 printf("type_rng_right 0x%x\n", transitions_ptr->value);
1125 break;
1127 case type_branch:
1128 printf("type_branch -> %d\n", transitions_ptr->value);
1129 break;
1131 case type_jump:
1132 printf("type_jump -> %d\n", transitions_ptr->value);
1133 break;
1135 default:
1136 printf("UNEXPECTED TYPE\n");
1137 break;
1139 transitions_ptr++;
1140 search_states_ptr++;
1142 printf("flags: ");
1143 if (!(compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_NEWLINE | REGEX_ID_CHECK | REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)))
1144 printf("none ");
1145 if (compiler_common->flags & REGEX_MATCH_BEGIN)
1146 printf("REGEX_MATCH_BEGIN ");
1147 if (compiler_common->flags & REGEX_MATCH_END)
1148 printf("REGEX_MATCH_END ");
1149 if (compiler_common->flags & REGEX_NEWLINE)
1150 printf("REGEX_NEWLINE ");
1151 if (compiler_common->flags & REGEX_ID_CHECK)
1152 printf("REGEX_ID_CHECK ");
1153 if (compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)
1154 printf("REGEX_FAKE_MATCH_BEGIN ");
1155 if (compiler_common->flags & REGEX_FAKE_MATCH_END)
1156 printf("REGEX_FAKE_MATCH_END ");
1157 if (compiler_common->longest_range_size > 0)
1158 printf("(longest range: %ld) ", (long)compiler_common->longest_range_size);
1159 printf("\n");
1162 #endif
1164 /* --------------------------------------------------------------------- */
1165 /* Utilities */
1166 /* --------------------------------------------------------------------- */
1168 static int generate_search_states(struct compiler_common *compiler_common)
1170 struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1171 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1172 struct stack_item *search_states_ptr;
1173 struct stack_item *rng_start = NULL;
1175 compiler_common->terms_size = !(compiler_common->flags & REGEX_FAKE_MATCH_END) ? 1 : 2;
1176 compiler_common->longest_range_size = 0;
1177 compiler_common->search_states = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
1178 if (!compiler_common->search_states)
1179 return REGEX_MEMORY_ERROR;
1181 search_states_ptr = compiler_common->search_states;
1182 while (transitions_ptr < transitions_end) {
1183 switch (transitions_ptr->type) {
1184 case type_begin:
1185 case type_end:
1186 search_states_ptr->type = 0;
1187 break;
1189 case type_char:
1190 search_states_ptr->type = (int)compiler_common->terms_size++;
1191 break;
1193 case type_newline:
1194 if (transitions_ptr->value)
1195 search_states_ptr->type = 1;
1196 else
1197 search_states_ptr->type = (int)compiler_common->terms_size++;
1198 SLJIT_ASSERT(search_states_ptr->type == 1 || search_states_ptr->type == 2);
1199 break;
1201 case type_id:
1202 if (transitions_ptr->value > 0)
1203 compiler_common->flags |= REGEX_ID_CHECK;
1204 search_states_ptr->type = -1;
1205 break;
1207 case type_rng_start:
1208 search_states_ptr->type = (int)compiler_common->terms_size;
1209 rng_start = search_states_ptr;
1210 break;
1212 case type_rng_end:
1213 search_states_ptr->type = (int)compiler_common->terms_size++;
1214 /* This is an over estimation. */
1215 if (compiler_common->longest_range_size < search_states_ptr - rng_start)
1216 compiler_common->longest_range_size = search_states_ptr - rng_start;
1217 break;
1219 default:
1220 search_states_ptr->type = -1;
1221 break;
1223 search_states_ptr->value = -1;
1224 search_states_ptr++;
1225 transitions_ptr++;
1227 return REGEX_NO_ERROR;
1230 static int trace_transitions(int from, struct compiler_common *compiler_common)
1232 int id = 0;
1233 struct stack *stack = &compiler_common->stack;
1234 struct stack *depth = &compiler_common->depth;
1235 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1236 struct stack_item *search_states = compiler_common->search_states;
1238 SLJIT_ASSERT(search_states[from].type >= 0);
1240 from++;
1242 /* Be prepared for any paths (loops, etc). */
1243 while (1) {
1244 if (dfa_transitions[from].type == type_id)
1245 if (id < dfa_transitions[from].value)
1246 id = dfa_transitions[from].value;
1248 if (search_states[from].value < id) {
1249 /* Forward step. */
1250 if (search_states[from].value == -1)
1251 if (stack_push(stack, 0, from))
1252 return REGEX_MEMORY_ERROR;
1253 search_states[from].value = id;
1255 if (dfa_transitions[from].type == type_branch) {
1256 if (stack_push(depth, id, from))
1257 return REGEX_MEMORY_ERROR;
1258 from++;
1259 continue;
1261 else if (dfa_transitions[from].type == type_jump) {
1262 from = dfa_transitions[from].value;
1263 continue;
1265 else if (search_states[from].type < 0) {
1266 from++;
1267 continue;
1271 /* Back tracking. */
1272 if (depth->count > 0) {
1273 id = stack_top(depth)->type;
1274 from = dfa_transitions[stack_pop(depth)->value].value;
1275 continue;
1277 return 0;
1281 /* --------------------------------------------------------------------- */
1282 /* Code generator */
1283 /* --------------------------------------------------------------------- */
1285 #define TERM_OFFSET_OF(index, offs) (((index) * no_states + (offs)) * (sljit_sw)sizeof(sljit_sw))
1286 #define TERM_REL_OFFSET_OF(base, offs) ((base) + ((offs) * (sljit_sw)sizeof(sljit_sw)))
1288 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1289 CHECK(sljit_emit_op1(compiler, type, arg1, arg2, arg3, arg4))
1291 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1292 CHECK(sljit_emit_op2(compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1294 #define EMIT_OP2U(type, arg1, arg2, arg3, arg4) \
1295 CHECK(sljit_emit_op2u(compiler, type, arg1, arg2, arg3, arg4))
1297 #define EMIT_LABEL(label) \
1298 label = sljit_emit_label(compiler); \
1299 CHECK(!label)
1301 #define EMIT_JUMP(jump, type) \
1302 jump = sljit_emit_jump(compiler, type); \
1303 CHECK(!jump)
1305 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1306 jump = sljit_emit_cmp(compiler, type, arg1, arg2, arg3, arg4); \
1307 CHECK(!jump)
1309 /* CHECK depends on the use case. */
1311 #define CHECK(exp) \
1312 if (SLJIT_UNLIKELY(exp)) \
1313 return REGEX_MEMORY_ERROR
1315 static int compile_uncond_tran(struct compiler_common *compiler_common, int reg)
1317 struct sljit_compiler *compiler = compiler_common->compiler;
1318 struct stack *stack = &compiler_common->stack;
1319 struct stack_item *search_states = compiler_common->search_states;
1320 int flags = compiler_common->flags;
1321 sljit_sw no_states = compiler_common->no_states;
1322 sljit_sw head = 0;
1323 sljit_sw offset, value;
1325 if (reg != R_CURR_STATE || !(compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)) {
1326 CHECK(trace_transitions(0, compiler_common));
1328 else {
1329 CHECK(trace_transitions(1, compiler_common));
1332 while (stack->count > 0) {
1333 value = stack_pop(stack)->value;
1334 if (search_states[value].type >= 0) {
1335 offset = TERM_OFFSET_OF(search_states[value].type, 0);
1336 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1337 if (offset > 0)
1338 head = offset;
1340 if (!(flags & REGEX_MATCH_BEGIN)) {
1341 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), R_TEMP, 0);
1342 if (flags & REGEX_ID_CHECK) {
1343 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, search_states[value].value);
1346 else if (flags & REGEX_ID_CHECK) {
1347 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, search_states[value].value);
1350 search_states[value].value = -1;
1352 if (reg == R_NEXT_STATE) {
1353 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1355 else if (flags & REGEX_FAKE_MATCH_BEGIN) {
1356 SLJIT_ASSERT(compiler_common->dfa_transitions[1].type == type_newline && !compiler_common->dfa_transitions[1].value);
1357 offset = TERM_OFFSET_OF(search_states[1].type, 0);
1359 SLJIT_ASSERT(!(flags & REGEX_MATCH_BEGIN));
1361 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1362 head = offset;
1364 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, 1);
1365 if (flags & REGEX_ID_CHECK) {
1366 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, 0);
1369 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, head);
1370 return REGEX_NO_ERROR;
1373 static int compile_cond_tran(struct compiler_common *compiler_common, sljit_sw curr_index)
1375 struct sljit_compiler *compiler = compiler_common->compiler;
1376 struct stack *stack = &compiler_common->stack;
1377 struct stack_item *search_states = compiler_common->search_states;
1378 sljit_sw offset;
1379 int flags;
1380 sljit_sw no_states;
1381 sljit_sw value;
1382 struct sljit_jump *jump1;
1383 struct sljit_jump *jump2;
1384 struct sljit_jump *jump3;
1385 struct sljit_jump *jump4;
1386 struct sljit_jump *jump5;
1387 struct sljit_label *label1;
1389 flags = compiler_common->flags;
1390 no_states = compiler_common->no_states;
1392 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
1393 if (!(flags & (REGEX_ID_CHECK | REGEX_MATCH_BEGIN))) {
1394 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1397 while (stack->count > 0) {
1398 value = stack_pop(stack)->value;
1399 if (search_states[value].type >= 0) {
1400 #ifdef REGEX_MATCH_VERBOSE
1401 if (flags & REGEX_MATCH_VERBOSE)
1402 printf("-> (%3d:%3d) ", search_states[value].type, search_states[value].value);
1403 #endif
1404 offset = TERM_OFFSET_OF(search_states[value].type, 0);
1406 if (!(flags & REGEX_ID_CHECK)) {
1407 if (!(flags & REGEX_MATCH_BEGIN)) {
1408 /* Check whether item is inserted. */
1409 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), SLJIT_IMM, -1);
1410 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), R_NEXT_HEAD, 0);
1411 if (offset > 0) {
1412 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1414 EMIT_JUMP(jump2, SLJIT_JUMP);
1416 /* Check whether old index <= index. */
1417 EMIT_LABEL(label1);
1418 sljit_set_label(jump1, label1);
1420 EMIT_CMP(jump1, SLJIT_LESS_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1422 EMIT_LABEL(label1);
1423 sljit_set_label(jump2, label1);
1424 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1426 EMIT_LABEL(label1);
1427 sljit_set_label(jump1, label1);
1429 else {
1430 /* Check whether item is inserted. */
1431 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), SLJIT_IMM, -1);
1432 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), R_NEXT_HEAD, 0);
1433 if (offset > 0) {
1434 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1436 EMIT_LABEL(label1);
1437 sljit_set_label(jump1, label1);
1440 else {
1441 if (!(flags & REGEX_MATCH_BEGIN)) {
1442 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1444 /* Check whether item is inserted. */
1445 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), SLJIT_IMM, -1);
1446 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), R_NEXT_HEAD, 0);
1447 if (offset > 0) {
1448 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1450 EMIT_JUMP(jump2, SLJIT_JUMP);
1452 /* Check whether old index != index. */
1453 EMIT_LABEL(label1);
1454 sljit_set_label(jump1, label1);
1456 EMIT_OP2U(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1457 EMIT_JUMP(jump1, SLJIT_LESS);
1458 EMIT_JUMP(jump3, SLJIT_NOT_EQUAL); /* Greater. */
1460 /* Old index == index. */
1461 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1462 if (search_states[value].value > 0) {
1463 EMIT_CMP(jump4, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1465 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1466 EMIT_LABEL(label1);
1467 sljit_set_label(jump4, label1);
1470 EMIT_OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1471 EMIT_JUMP(jump4, SLJIT_GREATER_EQUAL);
1472 EMIT_JUMP(jump5, SLJIT_JUMP);
1474 /* Overwrite index & id. */
1475 EMIT_LABEL(label1);
1476 sljit_set_label(jump3, label1);
1477 sljit_set_label(jump2, label1);
1478 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1480 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1481 if (search_states[value].value > 0) {
1482 EMIT_CMP(jump3, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1484 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1485 EMIT_LABEL(label1);
1486 sljit_set_label(jump3, label1);
1489 EMIT_LABEL(label1);
1490 sljit_set_label(jump5, label1);
1491 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1493 /* Exit. */
1494 EMIT_LABEL(label1);
1495 sljit_set_label(jump1, label1);
1496 sljit_set_label(jump4, label1);
1498 else {
1499 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1501 if (search_states[value].value > 0) {
1502 EMIT_CMP(jump1, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1504 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1505 EMIT_LABEL(label1);
1506 sljit_set_label(jump1, label1);
1509 /* Check whether item is inserted. */
1510 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), SLJIT_IMM, -1);
1511 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + (sljit_sw)sizeof(sljit_sw), R_NEXT_HEAD, 0);
1512 if (offset > 0) {
1513 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1515 EMIT_JUMP(jump2, SLJIT_JUMP);
1517 /* Check whether old id >= id. */
1518 EMIT_LABEL(label1);
1519 sljit_set_label(jump1, label1);
1521 EMIT_CMP(jump1, SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1523 EMIT_LABEL(label1);
1524 sljit_set_label(jump2, label1);
1525 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * (sljit_sw)sizeof(sljit_sw), R_TEMP, 0);
1527 EMIT_LABEL(label1);
1528 sljit_set_label(jump1, label1);
1532 search_states[value].value = -1;
1535 #ifdef REGEX_MATCH_VERBOSE
1536 if (flags & REGEX_MATCH_VERBOSE)
1537 printf("\n");
1538 #endif
1539 return REGEX_NO_ERROR;
1542 static int compile_end_check(struct compiler_common *compiler_common, struct sljit_label *end_check_label)
1544 struct sljit_compiler *compiler = compiler_common->compiler;
1545 struct sljit_jump *jump;
1546 struct sljit_jump *clear_states_jump;
1547 struct sljit_label *label;
1548 struct sljit_label *leave_label;
1549 struct sljit_label *begin_loop_label;
1551 /* Priority order: best_begin > best_end > best_id.
1552 In other words:
1553 if (new best_begin > old test_begin) do nothing
1554 otherwise we know that new_end > old_end, since R_CURR_INDEX ever increasing
1555 therefore we must overwrite all best_* variables (new_id also contains the highest id for this turn). */
1557 /* Both R_CURR_CHAR and R_BEST_BEGIN used as temporary registers. */
1559 if (!(compiler_common->flags & REGEX_MATCH_BEGIN)) {
1560 EMIT_OP1(SLJIT_MOV, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1561 EMIT_CMP(jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_LESS : SLJIT_LESS_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1562 sljit_set_label(jump, end_check_label);
1564 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1565 if (!(compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) {
1566 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1568 else {
1569 if ((compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) == (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) {
1570 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 2);
1572 else {
1573 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 1);
1576 if (compiler_common->flags & REGEX_ID_CHECK) {
1577 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 3));
1580 EMIT_CMP(clear_states_jump, SLJIT_LESS, R_CURR_CHAR, 0, R_BEST_BEGIN, 0);
1582 EMIT_LABEL(leave_label);
1583 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, R_CURR_CHAR, 0);
1584 EMIT_JUMP(jump, SLJIT_JUMP);
1585 sljit_set_label(jump, end_check_label);
1587 /* A loop to clear all states, which are > (or >=) than R_CURR_CHAR. */
1588 EMIT_LABEL(label);
1589 sljit_set_label(clear_states_jump, label);
1591 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1592 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
1594 /* Begin of the loop. */
1595 EMIT_LABEL(begin_loop_label);
1596 EMIT_CMP(jump, SLJIT_EQUAL, R_TEMP, 0, SLJIT_IMM, 0);
1597 sljit_set_label(jump, leave_label);
1599 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, R_CURR_STATE, 0);
1600 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw));
1601 EMIT_CMP(clear_states_jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_GREATER : SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_TEMP), 2 * sizeof(sljit_sw), R_CURR_CHAR, 0);
1603 /* Case 1: keep this case. */
1604 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), R_NEXT_HEAD, 0);
1605 EMIT_OP2(SLJIT_SUB, R_NEXT_HEAD, 0, R_TEMP, 0, R_CURR_STATE, 0);
1607 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1608 EMIT_JUMP(jump, SLJIT_JUMP);
1609 sljit_set_label(jump, begin_loop_label);
1611 /* Case 2: remove this case. */
1612 EMIT_LABEL(label);
1613 sljit_set_label(clear_states_jump, label);
1615 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), SLJIT_IMM, -1);
1617 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1618 EMIT_JUMP(jump, SLJIT_JUMP);
1619 sljit_set_label(jump, begin_loop_label);
1621 else {
1622 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_IMM, 0);
1623 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
1624 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1625 if (compiler_common->flags & REGEX_ID_CHECK) {
1626 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1628 EMIT_JUMP(jump, SLJIT_JUMP);
1629 sljit_set_label(jump, end_check_label);
1631 return REGEX_NO_ERROR;
1634 static int compile_leave_fast_forward(struct compiler_common *compiler_common, struct sljit_label *fast_forward_label)
1636 struct sljit_compiler *compiler = compiler_common->compiler;
1637 struct stack *stack = &compiler_common->stack;
1638 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1639 struct stack_item *search_states = compiler_common->search_states;
1640 int ind;
1641 struct sljit_jump *jump;
1642 int init_range = 1, prev_value = 0;
1644 while (stack->count > 0) {
1645 ind = stack_pop(stack)->value;
1646 search_states[ind].value = -1;
1647 if (search_states[ind].type >= 0) {
1648 if (dfa_transitions[ind].type == type_char) {
1649 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1650 sljit_set_label(jump, fast_forward_label);
1652 else if (dfa_transitions[ind].type == type_rng_start) {
1653 SLJIT_ASSERT(!dfa_transitions[ind].value);
1654 ind++;
1655 while (dfa_transitions[ind].type != type_rng_end) {
1656 if (dfa_transitions[ind].type == type_rng_char) {
1657 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1658 sljit_set_label(jump, fast_forward_label);
1660 else {
1661 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1662 if (init_range) {
1663 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1664 init_range = 0;
1666 if (dfa_transitions[ind].value != prev_value) {
1667 /* Best compatibility to all archs. */
1668 prev_value -= dfa_transitions[ind].value;
1669 if (prev_value < 0) {
1670 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1672 else {
1673 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1675 prev_value = dfa_transitions[ind].value;
1677 EMIT_CMP(jump, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1678 sljit_set_label(jump, fast_forward_label);
1679 ind++;
1681 ind++;
1684 else {
1685 SLJIT_ASSERT(dfa_transitions[ind].type == type_newline);
1686 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1687 sljit_set_label(jump, fast_forward_label);
1688 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1689 sljit_set_label(jump, fast_forward_label);
1693 return REGEX_NO_ERROR;
1696 static int compile_newline_check(struct compiler_common *compiler_common, sljit_sw ind)
1698 struct sljit_compiler *compiler = compiler_common->compiler;
1699 struct sljit_jump *jump1;
1700 struct sljit_jump *jump2;
1701 struct sljit_label *label;
1702 sljit_sw no_states;
1703 sljit_sw offset;
1705 /* Check whether a new-line character is found. */
1706 EMIT_CMP(jump1, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1707 EMIT_CMP(jump2, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1709 no_states = compiler_common->no_states;
1710 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1711 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1712 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1713 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1715 EMIT_LABEL(label);
1716 sljit_set_label(jump1, label);
1717 sljit_set_label(jump2, label);
1718 return REGEX_NO_ERROR;
1721 #undef CHECK
1723 #define CHECK(exp) \
1724 if (SLJIT_UNLIKELY(exp)) \
1725 return 0
1727 static SLJIT_INLINE void range_set_label(struct sljit_jump **range_jump_list, struct sljit_label *label)
1729 while (*range_jump_list) {
1730 sljit_set_label(*range_jump_list, label);
1731 range_jump_list++;
1735 static sljit_sw compile_range_check(struct compiler_common *compiler_common, sljit_sw ind)
1737 struct sljit_compiler *compiler = compiler_common->compiler;
1738 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1739 struct sljit_jump **range_jump_list = compiler_common->range_jump_list;
1740 int invert = dfa_transitions[ind].value;
1741 struct sljit_label *label;
1742 sljit_sw no_states;
1743 sljit_sw offset;
1744 int init_range = 1, prev_value = 0;
1746 ind++;
1748 while (dfa_transitions[ind].type != type_rng_end) {
1749 if (dfa_transitions[ind].type == type_rng_char) {
1750 EMIT_CMP(*range_jump_list, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1751 range_jump_list++;
1753 else {
1754 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1755 if (init_range) {
1756 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1757 init_range = 0;
1759 if (dfa_transitions[ind].value != prev_value) {
1760 /* Best compatibility to all archs. */
1761 prev_value -= dfa_transitions[ind].value;
1762 if (prev_value < 0) {
1763 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1765 else {
1766 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1768 prev_value = dfa_transitions[ind].value;
1770 EMIT_CMP(*range_jump_list, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1771 range_jump_list++;
1772 ind++;
1774 ind++;
1777 *range_jump_list = NULL;
1779 if (!invert) {
1780 no_states = compiler_common->no_states;
1781 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1782 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1783 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1784 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1786 EMIT_LABEL(label);
1787 range_set_label(compiler_common->range_jump_list, label);
1788 /* Clears the jump list. */
1789 *compiler_common->range_jump_list = NULL;
1791 return ind;
1794 #undef TERM_OFFSET_OF
1795 #undef EMIT_OP1
1796 #undef EMIT_OP2
1797 #undef EMIT_LABEL
1798 #undef EMIT_JUMP
1799 #undef EMIT_CMP
1800 #undef CHECK
1802 /* --------------------------------------------------------------------- */
1803 /* Main compiler */
1804 /* --------------------------------------------------------------------- */
1806 #define TERM_OFFSET_OF(ind, offs) (((ind) * compiler_common.no_states + (offs)) * (sljit_sw)sizeof(sljit_sw))
1808 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1809 CHECK(sljit_emit_op1(compiler_common.compiler, type, arg1, arg2, arg3, arg4))
1811 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1812 CHECK(sljit_emit_op2(compiler_common.compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1814 #define EMIT_LABEL(label) \
1815 label = sljit_emit_label(compiler_common.compiler); \
1816 CHECK(!label)
1818 #define EMIT_JUMP(jump, type) \
1819 jump = sljit_emit_jump(compiler_common.compiler, type); \
1820 CHECK(!jump)
1822 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1823 jump = sljit_emit_cmp(compiler_common.compiler, type, arg1, arg2, arg3, arg4); \
1824 CHECK(!jump)
1826 /* A do {} while(0) expression helps to avoid goto statements. */
1827 #define BEGIN_GUARD \
1828 do {
1830 #define END_GUARD \
1831 } while(0);
1833 #define CHECK(exp) \
1834 if (SLJIT_UNLIKELY(exp)) \
1835 break;
1837 struct regex_machine* regex_compile(const regex_char_t *regex_string, int length, int re_flags, int *error)
1839 struct compiler_common compiler_common;
1840 sljit_sw ind;
1841 int error_code, done, suggest_fast_forward;
1842 /* ID of an empty match (-1 if not reachable). */
1843 int empty_match_id;
1845 struct sljit_jump *jump;
1846 struct sljit_jump *best_match_found_jump;
1847 struct sljit_jump *fast_forward_jump = NULL;
1848 struct sljit_jump *length_is_zero_jump;
1849 struct sljit_jump *end_check_jump = NULL;
1850 struct sljit_jump *best_match_check_jump = NULL;
1851 struct sljit_jump *non_greedy_end_jump = NULL;
1852 struct sljit_label *label;
1853 struct sljit_label *end_check_label = NULL;
1854 struct sljit_label *start_label;
1855 struct sljit_label *fast_forward_label;
1856 struct sljit_label *fast_forward_return_label;
1858 if (error)
1859 *error = REGEX_NO_ERROR;
1860 #ifdef REGEX_MATCH_VERBOSE
1861 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE | REGEX_MATCH_VERBOSE);
1862 #else
1863 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE);
1864 #endif
1866 /* Step 1: parsing (Left->Right).
1867 Syntax check and AST generator. */
1868 error_code = parse(regex_string, length, &compiler_common);
1869 if (error_code) {
1870 stack_destroy(&compiler_common.stack);
1871 if (error)
1872 *error = error_code;
1873 return NULL;
1876 /* Step 2: generating branches (Right->Left). */
1877 error_code = generate_transitions(&compiler_common);
1878 stack_destroy(&compiler_common.stack);
1879 stack_destroy(&compiler_common.depth);
1880 if (error_code) {
1881 if (compiler_common.dfa_transitions)
1882 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1883 if (error)
1884 *error = error_code;
1885 return NULL;
1888 /* Step 3: Generate necessary data for depth-first search (Left->Right). */
1889 error_code = generate_search_states(&compiler_common);
1890 if (error_code) {
1891 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1892 if (error)
1893 *error = error_code;
1894 return NULL;
1897 #ifdef REGEX_MATCH_VERBOSE
1898 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1899 verbose_transitions(&compiler_common);
1900 #endif
1902 /* Step 4: Left->Right generate code. */
1903 stack_init(&compiler_common.stack);
1904 stack_init(&compiler_common.depth);
1905 done = 0;
1906 compiler_common.machine = NULL;
1907 compiler_common.compiler = NULL;
1908 compiler_common.range_jump_list = NULL;
1910 BEGIN_GUARD
1912 compiler_common.machine = (struct regex_machine*)SLJIT_MALLOC(sizeof(struct regex_machine) + (sljit_uw)(compiler_common.terms_size - 1) * sizeof(sljit_uw), NULL);
1913 CHECK(!compiler_common.machine);
1915 compiler_common.compiler = sljit_create_compiler(NULL, NULL);
1916 CHECK(!compiler_common.compiler);
1918 if (compiler_common.longest_range_size > 0) {
1919 compiler_common.range_jump_list = (struct sljit_jump**)SLJIT_MALLOC(sizeof(struct sljit_jump*) * (sljit_uw)compiler_common.longest_range_size, NULL);
1920 CHECK(!compiler_common.range_jump_list);
1923 if ((compiler_common.flags & REGEX_ID_CHECK) && !(compiler_common.flags & REGEX_MATCH_BEGIN))
1924 compiler_common.no_states = 4;
1925 else if (!(compiler_common.flags & REGEX_ID_CHECK) && (compiler_common.flags & REGEX_MATCH_BEGIN))
1926 compiler_common.no_states = 2;
1927 else
1928 compiler_common.no_states = 3;
1930 compiler_common.machine->flags = compiler_common.flags;
1931 compiler_common.machine->no_states = compiler_common.no_states;
1932 compiler_common.machine->size = compiler_common.machine->no_states * compiler_common.terms_size;
1934 /* Study the regular expression. */
1935 empty_match_id = -1;
1936 suggest_fast_forward = 1;
1937 if (!(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN)) {
1938 CHECK(trace_transitions(0, &compiler_common));
1939 while (compiler_common.stack.count > 0) {
1940 ind = stack_pop(&compiler_common.stack)->value;
1941 if (compiler_common.search_states[ind].type == 0) {
1942 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1943 suggest_fast_forward = 0;
1944 empty_match_id = compiler_common.search_states[ind].value;
1946 else if (compiler_common.search_states[ind].type > 0) {
1947 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type != type_end);
1948 if (compiler_common.dfa_transitions[ind].type == type_rng_start && compiler_common.dfa_transitions[ind].value)
1949 suggest_fast_forward = 0;
1951 compiler_common.search_states[ind].value = -1;
1954 else {
1955 SLJIT_ASSERT(compiler_common.dfa_transitions[1].type == type_newline);
1956 CHECK(trace_transitions(1, &compiler_common));
1957 while (compiler_common.stack.count > 0) {
1958 ind = stack_pop(&compiler_common.stack)->value;
1959 if (compiler_common.search_states[ind].type == 0) {
1960 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1961 suggest_fast_forward = 0;
1962 empty_match_id = compiler_common.search_states[ind].value;
1964 compiler_common.search_states[ind].value = -1;
1968 /* Step 4.1: Generate entry. */
1969 CHECK(sljit_emit_enter(compiler_common.compiler, 0, SLJIT_ARGS3(VOID, P, P, 32), 5, 5, 0, 0, 0));
1971 /* Copy arguments to their place. */
1972 EMIT_OP1(SLJIT_MOV, R_REGEX_MATCH, 0, SLJIT_S0, 0);
1973 EMIT_OP1(SLJIT_MOV, R_STRING, 0, SLJIT_S1, 0);
1974 EMIT_OP2(SLJIT_ADD, R_LENGTH, 0, SLJIT_S2, 0, SLJIT_IMM, 1);
1976 /* Init global registers. */
1977 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
1978 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
1979 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
1980 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin));
1981 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index));
1983 /* Check whether the best match has already found in a previous frame. */
1984 EMIT_CMP(jump, SLJIT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 0);
1985 EMIT_JUMP(best_match_found_jump, SLJIT_JUMP);
1987 #ifdef REGEX_MATCH_VERBOSE
1988 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1989 printf("\n-----------------\nTrace\n-----------------\n");
1990 #endif
1992 /* Step 4.2: Generate code for state 0. */
1993 EMIT_LABEL(label);
1994 sljit_emit_op0(compiler_common.compiler, SLJIT_ENDBR);
1995 compiler_common.machine->entry_addrs[0] = (sljit_uw)label;
1997 /* Swapping current and next. */
1998 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_STATE, 0);
1999 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_NEXT_STATE, 0);
2000 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_TEMP, 0);
2002 /* Checking whether the best case needs to be updated. */
2003 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2004 EMIT_CMP(end_check_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2005 EMIT_LABEL(end_check_label);
2007 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2008 EMIT_OP2(SLJIT_ADD, R_CURR_INDEX, 0, R_CURR_INDEX, 0, SLJIT_IMM, 1);
2010 /* Checking whether best case has already found. */
2011 if (!(compiler_common.flags & REGEX_MATCH_END) || (compiler_common.flags & REGEX_MATCH_BEGIN)) {
2012 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2013 /* We can bail out if no more active states remain and R_BEST_BEGIN != -1. */
2014 EMIT_CMP(best_match_check_jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2016 else {
2017 /* We can bail out if no more active states remain (regardless of R_BEST_BEGIN). */
2018 EMIT_CMP(best_match_check_jump, SLJIT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2022 EMIT_LABEL(start_label);
2023 sljit_set_label(jump, start_label);
2025 if (!(compiler_common.flags & REGEX_MATCH_BEGIN) && suggest_fast_forward) {
2026 EMIT_CMP(fast_forward_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2029 /* Loading the next character. */
2030 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_LENGTH, 0, R_LENGTH, 0, SLJIT_IMM, 1);
2031 EMIT_JUMP(length_is_zero_jump, SLJIT_EQUAL);
2033 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_STRING, 0);
2034 #ifdef REGEX_USE_8BIT_CHARS
2035 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2036 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 1);
2037 #else
2038 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2039 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 2);
2040 #endif
2041 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_TEMP, 0);
2043 #ifdef REGEX_MATCH_VERBOSE
2044 if (compiler_common.flags & REGEX_MATCH_VERBOSE) {
2045 printf("(%3d): ", 0);
2046 CHECK(trace_transitions(0, &compiler_common));
2047 while (compiler_common.stack.count > 0) {
2048 ind = stack_pop(&compiler_common.stack)->value;
2049 if (compiler_common.search_states[ind].type >= 0)
2050 printf("-> (%3d:%3d) ", compiler_common.search_states[ind].type, compiler_common.search_states[ind].value);
2051 compiler_common.search_states[ind].value = -1;
2053 printf("\n");
2055 #endif
2057 EMIT_LABEL(fast_forward_return_label);
2058 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2059 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 1);
2060 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2061 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2064 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_INDEX, 0);
2065 CHECK(compile_uncond_tran(&compiler_common, R_NEXT_STATE));
2066 /* And branching to the first state. */
2067 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2069 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2070 EMIT_LABEL(label);
2071 sljit_set_label(jump, label);
2074 /* This is the case where we only have to reset the R_NEXT_HEAD. */
2075 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
2076 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2077 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2079 /* Fast-forward loop. */
2080 if (fast_forward_jump) {
2081 /* Quit from fast-forward loop. */
2082 EMIT_LABEL(fast_forward_label);
2083 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2084 EMIT_OP1(SLJIT_MOV, R_LENGTH, 0, R_NEXT_STATE, 0);
2085 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_CURR_STATE, 0);
2086 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, R_NEXT_HEAD, 0);
2087 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
2088 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
2089 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
2091 /* Update the start field of the locations. */
2092 CHECK(trace_transitions(0, &compiler_common));
2093 while (compiler_common.stack.count > 0) {
2094 ind = stack_pop(&compiler_common.stack)->value;
2095 if (compiler_common.search_states[ind].type >= 0) {
2096 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 2), R_TEMP, 0);
2098 compiler_common.search_states[ind].value = -1;
2100 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2101 EMIT_JUMP(jump, SLJIT_JUMP);
2102 sljit_set_label(jump, fast_forward_return_label);
2104 /* Start fast-forward. */
2105 EMIT_LABEL(label);
2106 sljit_set_label(fast_forward_jump, label);
2108 /* Moving everything to registers. */
2109 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2110 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2111 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2112 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_LENGTH, 0);
2113 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_STRING, 0);
2114 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, R_CURR_INDEX, 0);
2116 /* Fast forward mainloop. */
2117 EMIT_LABEL(label);
2118 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_NEXT_STATE, 0, R_NEXT_STATE, 0, SLJIT_IMM, 1);
2119 EMIT_JUMP(fast_forward_jump, SLJIT_EQUAL);
2121 #ifdef REGEX_USE_8BIT_CHARS
2122 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2123 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 1);
2124 #else
2125 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2126 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 2);
2127 #endif
2129 CHECK(trace_transitions(0, &compiler_common));
2130 CHECK(compile_leave_fast_forward(&compiler_common, fast_forward_label));
2132 EMIT_OP2(SLJIT_ADD, R_NEXT_HEAD, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2133 EMIT_JUMP(jump, SLJIT_JUMP);
2134 sljit_set_label(jump, label);
2136 /* String is finished. */
2137 EMIT_LABEL(label);
2138 sljit_set_label(fast_forward_jump, label);
2139 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_NEXT_HEAD, 0);
2140 EMIT_JUMP(fast_forward_jump, SLJIT_JUMP);
2143 /* End check. */
2144 if (end_check_jump) {
2145 EMIT_LABEL(label);
2146 sljit_set_label(end_check_jump, label);
2148 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || !(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2149 CHECK(compile_end_check(&compiler_common, end_check_label));
2151 else {
2152 /* Since we leave, we do not need to update the R_BEST_BEGIN. */
2153 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2154 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
2155 if (compiler_common.flags & REGEX_ID_CHECK) {
2156 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
2158 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2159 EMIT_JUMP(non_greedy_end_jump, SLJIT_JUMP);
2163 /* Finish check. */
2164 if (best_match_check_jump) {
2165 EMIT_LABEL(label);
2166 sljit_set_label(best_match_check_jump, label);
2168 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2169 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2170 sljit_set_label(jump, start_label);
2172 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2175 /* Leaving matching and storing the necessary values. */
2176 EMIT_LABEL(label);
2177 sljit_set_label(length_is_zero_jump, label);
2178 if (non_greedy_end_jump)
2179 sljit_set_label(non_greedy_end_jump, label);
2181 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_CURR_INDEX, 0);
2182 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2183 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2184 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2186 /* Exit from JIT. */
2187 EMIT_LABEL(label);
2188 sljit_set_label(best_match_found_jump, label);
2189 if (fast_forward_jump)
2190 sljit_set_label(fast_forward_jump, label);
2191 CHECK(sljit_emit_return_void(compiler_common.compiler));
2193 for (ind = 1; ind < (sljit_sw)compiler_common.dfa_size - 1; ind++) {
2194 if (compiler_common.search_states[ind].type >= 0) {
2195 SLJIT_ASSERT(compiler_common.search_states[ind].type < compiler_common.terms_size);
2196 EMIT_LABEL(label);
2197 sljit_emit_op0(compiler_common.compiler, SLJIT_ENDBR);
2198 compiler_common.machine->entry_addrs[compiler_common.search_states[ind].type] = (sljit_uw)label;
2200 if (compiler_common.dfa_transitions[ind].type == type_char) {
2201 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, compiler_common.dfa_transitions[ind].value);
2203 else if (compiler_common.dfa_transitions[ind].type == type_rng_start) {
2204 ind = compile_range_check(&compiler_common, ind);
2205 CHECK(!ind);
2207 else {
2208 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2209 CHECK(compile_newline_check(&compiler_common, ind));
2212 CHECK(trace_transitions((int)ind, &compiler_common));
2213 #ifdef REGEX_MATCH_VERBOSE
2214 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2215 printf("(%3d): ", compiler_common.search_states[ind].type);
2216 #endif
2217 CHECK(compile_cond_tran(&compiler_common, compiler_common.search_states[ind].type));
2219 if (compiler_common.dfa_transitions[ind].type == type_char) {
2220 EMIT_LABEL(label);
2221 sljit_set_label(jump, label);
2223 else if (compiler_common.dfa_transitions[ind].type == type_rng_end) {
2224 EMIT_LABEL(label);
2225 range_set_label(compiler_common.range_jump_list, label);
2227 else {
2228 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2231 /* Branch to the next item in the list. */
2232 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1));
2233 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1), SLJIT_IMM, -1);
2234 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2238 if (ind == (sljit_sw)compiler_common.dfa_size - 1) {
2239 /* Generate an init stub function. */
2240 EMIT_LABEL(label);
2241 CHECK(sljit_emit_enter(compiler_common.compiler, 0, SLJIT_ARGS2(W, P, P), 3, 3, 0, 0, 0));
2243 if (empty_match_id == -1) {
2244 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, -1);
2245 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, 0);
2247 else {
2248 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2249 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, empty_match_id);
2252 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, index), SLJIT_IMM, !(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN) ? 1 : 2);
2254 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || empty_match_id == -1) {
2255 /* The else is a really rare event, so we still generate an empty function instead of a runtime pointer check. */
2256 SLJIT_ASSERT(R_CURR_STATE == SLJIT_S0);
2257 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2258 /* R_CURR_INDEX (put to R_TEMP) is zero. */
2259 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, 0);
2261 CHECK(compile_uncond_tran(&compiler_common, R_CURR_STATE));
2263 else {
2264 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2266 CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_MOV, R_NEXT_HEAD, 0));
2268 compiler_common.machine->continue_match = sljit_generate_code(compiler_common.compiler);
2269 #ifndef SLJIT_INDIRECT_CALL
2270 compiler_common.machine->u.init_match = (void*)(sljit_sw)sljit_get_label_addr(label);
2271 #else
2272 sljit_set_function_context(&compiler_common.machine->u.init_match, &compiler_common.machine->context, sljit_get_label_addr(label), regex_compile);
2273 #endif
2274 #ifdef REGEX_MATCH_VERBOSE
2275 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2276 printf("Continue match: %p Init match: %p\n\n", compiler_common.machine->continue_match, compiler_common.machine->u.init_match);
2277 #endif
2278 if (compiler_common.machine->continue_match) {
2279 for (ind = 0; ind < compiler_common.terms_size; ++ind)
2280 compiler_common.machine->entry_addrs[ind] = sljit_get_label_addr((struct sljit_label*)compiler_common.machine->entry_addrs[ind]);
2281 done = 1;
2284 END_GUARD
2286 stack_destroy(&compiler_common.stack);
2287 stack_destroy(&compiler_common.depth);
2288 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
2289 SLJIT_FREE(compiler_common.search_states, NULL);
2290 if (compiler_common.range_jump_list)
2291 SLJIT_FREE(compiler_common.range_jump_list, NULL);
2292 if (compiler_common.compiler)
2293 sljit_free_compiler(compiler_common.compiler);
2294 if (done)
2295 return compiler_common.machine;
2297 if (compiler_common.machine) {
2298 SLJIT_FREE(compiler_common.machine, NULL);
2300 if (error)
2301 *error = REGEX_MEMORY_ERROR;
2302 return NULL;
2305 #undef TERM_OFFSET_OF
2306 #undef EMIT_OP1
2307 #undef EMIT_OP2
2308 #undef EMIT_LABEL
2309 #undef EMIT_JUMP
2310 #undef EMIT_CMP
2311 #undef BEGIN_GUARD
2312 #undef END_GUARD
2313 #undef CHECK
2315 void regex_free_machine(struct regex_machine *machine)
2317 sljit_free_code(machine->continue_match, NULL);
2318 SLJIT_FREE(machine, NULL);
2321 const char* regex_get_platform_name(void)
2323 return sljit_get_platform_name();
2326 /* --------------------------------------------------------------------- */
2327 /* Mathching utilities */
2328 /* --------------------------------------------------------------------- */
2330 struct regex_match* regex_begin_match(struct regex_machine *machine)
2332 sljit_sw *ptr1;
2333 sljit_sw *ptr2;
2334 sljit_sw *end;
2335 sljit_sw *entry_addrs;
2337 struct regex_match *match = (struct regex_match*)SLJIT_MALLOC(sizeof(struct regex_match) + (sljit_uw)(machine->size * 2 - 1) * sizeof(sljit_sw), NULL);
2338 if (!match)
2339 return NULL;
2341 ptr1 = match->states;
2342 ptr2 = match->states + machine->size;
2343 end = ptr2;
2344 entry_addrs = (sljit_sw*)machine->entry_addrs;
2346 match->current = ptr1;
2347 match->next = ptr2;
2348 match->head = 0;
2349 match->machine = machine;
2351 /* Init machine states. */
2352 switch (machine->no_states) {
2353 case 2:
2354 while (ptr1 < end) {
2355 *ptr1++ = *entry_addrs;
2356 *ptr2++ = *entry_addrs++;
2357 *ptr1++ = -1;
2358 *ptr2++ = -1;
2360 break;
2362 case 3:
2363 while (ptr1 < end) {
2364 *ptr1++ = *entry_addrs;
2365 *ptr2++ = *entry_addrs++;
2366 *ptr1++ = -1;
2367 *ptr2++ = -1;
2368 *ptr1++ = 0;
2369 *ptr2++ = 0;
2371 break;
2373 case 4:
2374 while (ptr1 < end) {
2375 *ptr1++ = *entry_addrs;
2376 *ptr2++ = *entry_addrs++;
2377 *ptr1++ = -1;
2378 *ptr2++ = -1;
2379 *ptr1++ = 0;
2380 *ptr2++ = 0;
2381 *ptr1++ = 0;
2382 *ptr2++ = 0;
2384 break;
2386 default:
2387 SLJIT_UNREACHABLE();
2388 break;
2391 SLJIT_ASSERT(ptr1 == end);
2393 match->u.continue_match = machine->continue_match;
2395 regex_reset_match(match);
2396 return match;
2399 void regex_reset_match(struct regex_match *match)
2401 struct regex_machine *machine = match->machine;
2402 sljit_sw current, ind;
2403 sljit_sw *current_ptr;
2405 match->best_end = 0;
2406 match->fast_quit = 0;
2407 match->fast_forward = 0;
2409 if (match->head != 0) {
2410 /* Clear the current state. */
2411 current = match->head;
2412 current_ptr = match->current;
2413 do {
2414 ind = (current / (sljit_sw)sizeof(sljit_sw)) + 1;
2415 current = current_ptr[ind];
2416 current_ptr[ind] = -1;
2417 } while (current != 0);
2419 match->head = machine->u.call_init(match->current, match);
2422 void regex_free_match(struct regex_match *match)
2424 SLJIT_FREE(match, NULL);
2427 void regex_continue_match(struct regex_match *match, const regex_char_t *input_string, int length)
2429 match->u.call_continue(match, input_string, length);
2432 int regex_get_result(struct regex_match *match, int *end, int *id)
2434 int flags = match->machine->flags;
2435 sljit_sw no_states;
2437 *end = (int)match->best_end;
2438 *id = (int)match->best_id;
2439 if (!(flags & (REGEX_MATCH_END | REGEX_FAKE_MATCH_END)))
2440 return (int)match->best_begin;
2442 if (flags & REGEX_FAKE_MATCH_END) {
2443 SLJIT_ASSERT(!(flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END)));
2444 if (match->best_begin != -1)
2445 return (int)match->best_begin;
2447 no_states = match->machine->no_states;
2448 if (match->current[no_states + 1] == -1)
2449 return -1;
2450 if (flags & REGEX_ID_CHECK)
2451 *id = (int)match->current[no_states + 3];
2452 if (!(flags & REGEX_FAKE_MATCH_BEGIN))
2453 *end = (int)match->index - 1;
2454 else
2455 *end = (int)match->index - 2;
2456 return (int)match->current[no_states + 2];
2458 else {
2459 /* Check the status of the last code. */
2460 if (!(flags & REGEX_MATCH_BEGIN)) {
2461 /* No shortcut in this case. */
2462 if (!(flags & REGEX_ID_CHECK)) {
2463 if (match->current[1] == -1)
2464 return -1;
2465 *end = (int)match->index - 1;
2466 return (int)match->current[2];
2469 if (match->current[1] == -1)
2470 return -1;
2471 *end = (int)match->index - 1;
2472 *id = (int)match->current[3];
2473 return (int)match->current[2];
2476 /* Shortcut is possible in this case. */
2477 if (!(flags & REGEX_ID_CHECK)) {
2478 if (match->current[1] == -1 || match->head == -1)
2479 return -1;
2480 *end = (int)match->index - 1;
2481 return 0;
2484 if (match->current[1] == -1 || match->head == -1)
2485 return -1;
2486 *end = (int)match->index - 1;
2487 *id = (int)match->current[2];
2488 return 0;
2492 int regex_is_match_finished(struct regex_match *match)
2494 return (int)match->fast_quit;
2497 #ifdef REGEX_MATCH_VERBOSE
2498 void regex_continue_match_debug(struct regex_match *match, const regex_char_t *input_string, int length)
2500 sljit_sw *ptr;
2501 sljit_sw *end;
2502 sljit_sw count;
2503 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2504 sljit_sw current;
2505 #endif
2506 sljit_sw no_states = match->machine->no_states;
2507 sljit_sw len = match->machine->size;
2509 while (length > 0) {
2510 match->u.call_continue(match, input_string, 1);
2512 if (match->fast_forward) {
2513 if (match->machine->flags & REGEX_MATCH_VERBOSE)
2514 printf("fast forward\n");
2517 /* Verbose (first). */
2518 if (match->machine->flags & REGEX_MATCH_VERBOSE) {
2519 ptr = match->current;
2520 end = ptr + len;
2521 count = 0;
2522 printf("'%c' (%3ld->%3ld [%3ld]) ", *input_string, (long)match->best_begin, (long)match->best_end, (long)match->best_id);
2523 while (ptr < end) {
2524 printf("[%3ld:", (long)count++);
2525 switch (no_states) {
2526 case 2:
2527 if (ptr[1] != -1)
2528 printf("+] ");
2529 else
2530 printf(" ] ");
2531 break;
2533 case 3:
2534 if (ptr[1] != -1)
2535 printf("+,%3ld] ", (long)ptr[2]);
2536 else
2537 printf(" ,XXX] ");
2538 break;
2540 case 4:
2541 if (ptr[1] != -1)
2542 printf("+,%3ld,%3ld] ", (long)ptr[2], (long)ptr[3]);
2543 else
2544 printf(" ,XXX,XXX] ");
2545 break;
2547 ptr += no_states;
2549 printf("\n");
2552 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2553 /* Sanity check (later). */
2554 ptr = match->next;
2555 end = ptr + len;
2556 while (ptr < end) {
2557 SLJIT_ASSERT(ptr[1] == -1);
2558 ptr += no_states;
2561 /* Check number of active elements. */
2562 ptr = match->current + no_states;
2563 end = ptr + len - no_states;
2564 count = 0;
2565 while (ptr < end) {
2566 if (ptr[1] != -1)
2567 count++;
2568 ptr += no_states;
2571 /* Check chain list. */
2572 current = match->head;
2573 ptr = match->current;
2574 while (current != 0) {
2575 SLJIT_ASSERT(current >= 0 && current < len * (sljit_sw)sizeof(sljit_sw));
2576 SLJIT_ASSERT((current % (no_states * (sljit_sw)sizeof(sljit_sw))) == 0);
2577 SLJIT_ASSERT(count > 0);
2578 current = ptr[(current / (sljit_sw)sizeof(sljit_sw)) + 1];
2579 count--;
2581 SLJIT_ASSERT(count == 0);
2582 #endif
2584 if (match->fast_quit) {
2585 /* the machine has stopped working. */
2586 if (match->machine->flags & REGEX_MATCH_VERBOSE)
2587 printf("Best match has found\n");
2588 break;
2591 input_string++;
2592 length--;
2595 #endif