* include/ruby/io.h (rb_io_t): new fields: writeconv,
[ruby-svn.git] / regcomp.c
blobcb54c441451f7a09a630ce9e19f48fefdead62c5
1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
30 #include "regparse.h"
32 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
34 extern OnigCaseFoldType
35 onig_get_default_case_fold_flag(void)
37 return OnigDefaultCaseFoldFlag;
40 extern int
41 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
43 OnigDefaultCaseFoldFlag = case_fold_flag;
44 return 0;
48 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
49 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
50 #endif
52 static UChar*
53 str_dup(UChar* s, UChar* end)
55 int len = end - s;
57 if (len > 0) {
58 UChar* r = (UChar* )xmalloc(len + 1);
59 CHECK_NULL_RETURN(r);
60 xmemcpy(r, s, len);
61 r[len] = (UChar )0;
62 return r;
64 else return NULL;
67 static void
68 swap_node(Node* a, Node* b)
70 Node c;
71 c = *a; *a = *b; *b = c;
73 if (NTYPE(a) == NT_STR) {
74 StrNode* sn = NSTR(a);
75 if (sn->capa == 0) {
76 int len = sn->end - sn->s;
77 sn->s = sn->buf;
78 sn->end = sn->s + len;
82 if (NTYPE(b) == NT_STR) {
83 StrNode* sn = NSTR(b);
84 if (sn->capa == 0) {
85 int len = sn->end - sn->s;
86 sn->s = sn->buf;
87 sn->end = sn->s + len;
92 static OnigDistance
93 distance_add(OnigDistance d1, OnigDistance d2)
95 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
96 return ONIG_INFINITE_DISTANCE;
97 else {
98 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
99 else return ONIG_INFINITE_DISTANCE;
103 static OnigDistance
104 distance_multiply(OnigDistance d, int m)
106 if (m == 0) return 0;
108 if (d < ONIG_INFINITE_DISTANCE / m)
109 return d * m;
110 else
111 return ONIG_INFINITE_DISTANCE;
114 static int
115 bitset_is_empty(BitSetRef bs)
117 int i;
118 for (i = 0; i < (int )BITSET_SIZE; i++) {
119 if (bs[i] != 0) return 0;
121 return 1;
124 #ifdef ONIG_DEBUG
125 static int
126 bitset_on_num(BitSetRef bs)
128 int i, n;
130 n = 0;
131 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
132 if (BITSET_AT(bs, i)) n++;
134 return n;
136 #endif
138 extern int
139 onig_bbuf_init(BBuf* buf, int size)
141 if (size <= 0) {
142 size = 0;
143 buf->p = NULL;
145 else {
146 buf->p = (UChar* )xmalloc(size);
147 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
150 buf->alloc = size;
151 buf->used = 0;
152 return 0;
156 #ifdef USE_SUBEXP_CALL
158 static int
159 unset_addr_list_init(UnsetAddrList* uslist, int size)
161 UnsetAddr* p;
163 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
164 CHECK_NULL_RETURN_MEMERR(p);
165 uslist->num = 0;
166 uslist->alloc = size;
167 uslist->us = p;
168 return 0;
171 static void
172 unset_addr_list_end(UnsetAddrList* uslist)
174 if (IS_NOT_NULL(uslist->us))
175 xfree(uslist->us);
178 static int
179 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
181 UnsetAddr* p;
182 int size;
184 if (uslist->num >= uslist->alloc) {
185 size = uslist->alloc * 2;
186 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
187 CHECK_NULL_RETURN_MEMERR(p);
188 uslist->alloc = size;
189 uslist->us = p;
192 uslist->us[uslist->num].offset = offset;
193 uslist->us[uslist->num].target = node;
194 uslist->num++;
195 return 0;
197 #endif /* USE_SUBEXP_CALL */
200 static int
201 add_opcode(regex_t* reg, int opcode)
203 BBUF_ADD1(reg, opcode);
204 return 0;
207 #ifdef USE_COMBINATION_EXPLOSION_CHECK
208 static int
209 add_state_check_num(regex_t* reg, int num)
211 StateCheckNumType n = (StateCheckNumType )num;
213 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
214 return 0;
216 #endif
218 static int
219 add_rel_addr(regex_t* reg, int addr)
221 RelAddrType ra = (RelAddrType )addr;
223 BBUF_ADD(reg, &ra, SIZE_RELADDR);
224 return 0;
227 static int
228 add_abs_addr(regex_t* reg, int addr)
230 AbsAddrType ra = (AbsAddrType )addr;
232 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
233 return 0;
236 static int
237 add_length(regex_t* reg, int len)
239 LengthType l = (LengthType )len;
241 BBUF_ADD(reg, &l, SIZE_LENGTH);
242 return 0;
245 static int
246 add_mem_num(regex_t* reg, int num)
248 MemNumType n = (MemNumType )num;
250 BBUF_ADD(reg, &n, SIZE_MEMNUM);
251 return 0;
254 static int
255 add_pointer(regex_t* reg, void* addr)
257 PointerType ptr = (PointerType )addr;
259 BBUF_ADD(reg, &ptr, SIZE_POINTER);
260 return 0;
263 static int
264 add_option(regex_t* reg, OnigOptionType option)
266 BBUF_ADD(reg, &option, SIZE_OPTION);
267 return 0;
270 static int
271 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
273 int r;
275 r = add_opcode(reg, opcode);
276 if (r) return r;
277 r = add_rel_addr(reg, addr);
278 return r;
281 static int
282 add_bytes(regex_t* reg, UChar* bytes, int len)
284 BBUF_ADD(reg, bytes, len);
285 return 0;
288 static int
289 add_bitset(regex_t* reg, BitSetRef bs)
291 BBUF_ADD(reg, bs, SIZE_BITSET);
292 return 0;
295 static int
296 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
298 int r;
300 r = add_opcode(reg, opcode);
301 if (r) return r;
302 r = add_option(reg, option);
303 return r;
306 static int compile_length_tree(Node* node, regex_t* reg);
307 static int compile_tree(Node* node, regex_t* reg);
310 #define IS_NEED_STR_LEN_OP_EXACT(op) \
311 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
312 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
314 static int
315 select_str_opcode(int mb_len, int str_len, int ignore_case)
317 int op;
319 if (ignore_case) {
320 switch (str_len) {
321 case 1: op = OP_EXACT1_IC; break;
322 default: op = OP_EXACTN_IC; break;
325 else {
326 switch (mb_len) {
327 case 1:
328 switch (str_len) {
329 case 1: op = OP_EXACT1; break;
330 case 2: op = OP_EXACT2; break;
331 case 3: op = OP_EXACT3; break;
332 case 4: op = OP_EXACT4; break;
333 case 5: op = OP_EXACT5; break;
334 default: op = OP_EXACTN; break;
336 break;
338 case 2:
339 switch (str_len) {
340 case 1: op = OP_EXACTMB2N1; break;
341 case 2: op = OP_EXACTMB2N2; break;
342 case 3: op = OP_EXACTMB2N3; break;
343 default: op = OP_EXACTMB2N; break;
345 break;
347 case 3:
348 op = OP_EXACTMB3N;
349 break;
351 default:
352 op = OP_EXACTMBN;
353 break;
356 return op;
359 static int
360 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
362 int r;
363 int saved_num_null_check = reg->num_null_check;
365 if (empty_info != 0) {
366 r = add_opcode(reg, OP_NULL_CHECK_START);
367 if (r) return r;
368 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
369 if (r) return r;
370 reg->num_null_check++;
373 r = compile_tree(node, reg);
374 if (r) return r;
376 if (empty_info != 0) {
377 if (empty_info == NQ_TARGET_IS_EMPTY)
378 r = add_opcode(reg, OP_NULL_CHECK_END);
379 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
380 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
381 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
382 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
384 if (r) return r;
385 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
387 return r;
390 #ifdef USE_SUBEXP_CALL
391 static int
392 compile_call(CallNode* node, regex_t* reg)
394 int r;
396 r = add_opcode(reg, OP_CALL);
397 if (r) return r;
398 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
399 node->target);
400 if (r) return r;
401 r = add_abs_addr(reg, 0 /*dummy addr.*/);
402 return r;
404 #endif
406 static int
407 compile_tree_n_times(Node* node, int n, regex_t* reg)
409 int i, r;
411 for (i = 0; i < n; i++) {
412 r = compile_tree(node, reg);
413 if (r) return r;
415 return 0;
418 static int
419 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
420 regex_t* reg ARG_UNUSED, int ignore_case)
422 int len;
423 int op = select_str_opcode(mb_len, str_len, ignore_case);
425 len = SIZE_OPCODE;
427 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
428 if (IS_NEED_STR_LEN_OP_EXACT(op))
429 len += SIZE_LENGTH;
431 len += mb_len * str_len;
432 return len;
435 static int
436 add_compile_string(UChar* s, int mb_len, int str_len,
437 regex_t* reg, int ignore_case)
439 int op = select_str_opcode(mb_len, str_len, ignore_case);
440 add_opcode(reg, op);
442 if (op == OP_EXACTMBN)
443 add_length(reg, mb_len);
445 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
446 if (op == OP_EXACTN_IC)
447 add_length(reg, mb_len * str_len);
448 else
449 add_length(reg, str_len);
452 add_bytes(reg, s, mb_len * str_len);
453 return 0;
457 static int
458 compile_length_string_node(Node* node, regex_t* reg)
460 int rlen, r, len, prev_len, slen, ambig;
461 OnigEncoding enc = reg->enc;
462 UChar *p, *prev;
463 StrNode* sn;
465 sn = NSTR(node);
466 if (sn->end <= sn->s)
467 return 0;
469 ambig = NSTRING_IS_AMBIG(node);
471 p = prev = sn->s;
472 prev_len = enclen(enc, p, sn->end);
473 p += prev_len;
474 slen = 1;
475 rlen = 0;
477 for (; p < sn->end; ) {
478 len = enclen(enc, p, sn->end);
479 if (len == prev_len) {
480 slen++;
482 else {
483 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
484 rlen += r;
485 prev = p;
486 slen = 1;
487 prev_len = len;
489 p += len;
491 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
492 rlen += r;
493 return rlen;
496 static int
497 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
499 if (sn->end <= sn->s)
500 return 0;
502 return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
505 static int
506 compile_string_node(Node* node, regex_t* reg)
508 int r, len, prev_len, slen, ambig;
509 OnigEncoding enc = reg->enc;
510 UChar *p, *prev, *end;
511 StrNode* sn;
513 sn = NSTR(node);
514 if (sn->end <= sn->s)
515 return 0;
517 end = sn->end;
518 ambig = NSTRING_IS_AMBIG(node);
520 p = prev = sn->s;
521 prev_len = enclen(enc, p, end);
522 p += prev_len;
523 slen = 1;
525 for (; p < end; ) {
526 len = enclen(enc, p, end);
527 if (len == prev_len) {
528 slen++;
530 else {
531 r = add_compile_string(prev, prev_len, slen, reg, ambig);
532 if (r) return r;
534 prev = p;
535 slen = 1;
536 prev_len = len;
539 p += len;
541 return add_compile_string(prev, prev_len, slen, reg, ambig);
544 static int
545 compile_string_raw_node(StrNode* sn, regex_t* reg)
547 if (sn->end <= sn->s)
548 return 0;
550 return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
553 static int
554 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
556 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
557 add_length(reg, mbuf->used);
558 return add_bytes(reg, mbuf->p, mbuf->used);
559 #else
560 int r, pad_size;
561 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
563 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
564 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
565 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
567 r = add_bytes(reg, mbuf->p, mbuf->used);
569 /* padding for return value from compile_length_cclass_node() to be fix. */
570 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
571 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
572 return r;
573 #endif
576 static int
577 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
579 int len;
581 if (IS_NCCLASS_SHARE(cc)) {
582 len = SIZE_OPCODE + SIZE_POINTER;
583 return len;
586 if (IS_NULL(cc->mbuf)) {
587 len = SIZE_OPCODE + SIZE_BITSET;
589 else {
590 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
591 len = SIZE_OPCODE;
593 else {
594 len = SIZE_OPCODE + SIZE_BITSET;
596 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
597 len += SIZE_LENGTH + cc->mbuf->used;
598 #else
599 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
600 #endif
603 return len;
606 static int
607 compile_cclass_node(CClassNode* cc, regex_t* reg)
609 int r;
611 if (IS_NCCLASS_SHARE(cc)) {
612 add_opcode(reg, OP_CCLASS_NODE);
613 r = add_pointer(reg, cc);
614 return r;
617 if (IS_NULL(cc->mbuf)) {
618 if (IS_NCCLASS_NOT(cc))
619 add_opcode(reg, OP_CCLASS_NOT);
620 else
621 add_opcode(reg, OP_CCLASS);
623 r = add_bitset(reg, cc->bs);
625 else {
626 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
627 if (IS_NCCLASS_NOT(cc))
628 add_opcode(reg, OP_CCLASS_MB_NOT);
629 else
630 add_opcode(reg, OP_CCLASS_MB);
632 r = add_multi_byte_cclass(cc->mbuf, reg);
634 else {
635 if (IS_NCCLASS_NOT(cc))
636 add_opcode(reg, OP_CCLASS_MIX_NOT);
637 else
638 add_opcode(reg, OP_CCLASS_MIX);
640 r = add_bitset(reg, cc->bs);
641 if (r) return r;
642 r = add_multi_byte_cclass(cc->mbuf, reg);
646 return r;
649 static int
650 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
652 #define REPEAT_RANGE_ALLOC 4
654 OnigRepeatRange* p;
656 if (reg->repeat_range_alloc == 0) {
657 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
658 CHECK_NULL_RETURN_MEMERR(p);
659 reg->repeat_range = p;
660 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
662 else if (reg->repeat_range_alloc <= id) {
663 int n;
664 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
665 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
666 sizeof(OnigRepeatRange) * n);
667 CHECK_NULL_RETURN_MEMERR(p);
668 reg->repeat_range = p;
669 reg->repeat_range_alloc = n;
671 else {
672 p = reg->repeat_range;
675 p[id].lower = lower;
676 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
677 return 0;
680 static int
681 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
682 regex_t* reg)
684 int r;
685 int num_repeat = reg->num_repeat;
687 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
688 if (r) return r;
689 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
690 reg->num_repeat++;
691 if (r) return r;
692 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
693 if (r) return r;
695 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
696 if (r) return r;
698 r = compile_tree_empty_check(qn->target, reg, empty_info);
699 if (r) return r;
701 if (
702 #ifdef USE_SUBEXP_CALL
703 reg->num_call > 0 ||
704 #endif
705 IS_QUANTIFIER_IN_REPEAT(qn)) {
706 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
708 else {
709 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
711 if (r) return r;
712 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
713 return r;
716 static int
717 is_anychar_star_quantifier(QtfrNode* qn)
719 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
720 NTYPE(qn->target) == NT_CANY)
721 return 1;
722 else
723 return 0;
726 #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
727 #define CKN_ON (ckn > 0)
729 #ifdef USE_COMBINATION_EXPLOSION_CHECK
731 static int
732 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
734 int len, mod_tlen, cklen;
735 int ckn;
736 int infinite = IS_REPEAT_INFINITE(qn->upper);
737 int empty_info = qn->target_empty_info;
738 int tlen = compile_length_tree(qn->target, reg);
740 if (tlen < 0) return tlen;
742 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
744 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
746 /* anychar repeat */
747 if (NTYPE(qn->target) == NT_CANY) {
748 if (qn->greedy && infinite) {
749 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
750 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
751 else
752 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
756 if (empty_info != 0)
757 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
758 else
759 mod_tlen = tlen;
761 if (infinite && qn->lower <= 1) {
762 if (qn->greedy) {
763 if (qn->lower == 1)
764 len = SIZE_OP_JUMP;
765 else
766 len = 0;
768 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
770 else {
771 if (qn->lower == 0)
772 len = SIZE_OP_JUMP;
773 else
774 len = 0;
776 len += mod_tlen + SIZE_OP_PUSH + cklen;
779 else if (qn->upper == 0) {
780 if (qn->is_refered != 0) /* /(?<n>..){0}/ */
781 len = SIZE_OP_JUMP + tlen;
782 else
783 len = 0;
785 else if (qn->upper == 1 && qn->greedy) {
786 if (qn->lower == 0) {
787 if (CKN_ON) {
788 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
790 else {
791 len = SIZE_OP_PUSH + tlen;
794 else {
795 len = tlen;
798 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
799 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
801 else {
802 len = SIZE_OP_REPEAT_INC
803 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
804 if (CKN_ON)
805 len += SIZE_OP_STATE_CHECK;
808 return len;
811 static int
812 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
814 int r, mod_tlen;
815 int ckn;
816 int infinite = IS_REPEAT_INFINITE(qn->upper);
817 int empty_info = qn->target_empty_info;
818 int tlen = compile_length_tree(qn->target, reg);
820 if (tlen < 0) return tlen;
822 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
824 if (is_anychar_star_quantifier(qn)) {
825 r = compile_tree_n_times(qn->target, qn->lower, reg);
826 if (r) return r;
827 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
828 if (IS_MULTILINE(reg->options))
829 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
830 else
831 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
832 if (r) return r;
833 if (CKN_ON) {
834 r = add_state_check_num(reg, ckn);
835 if (r) return r;
838 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
840 else {
841 if (IS_MULTILINE(reg->options)) {
842 r = add_opcode(reg, (CKN_ON ?
843 OP_STATE_CHECK_ANYCHAR_ML_STAR
844 : OP_ANYCHAR_ML_STAR));
846 else {
847 r = add_opcode(reg, (CKN_ON ?
848 OP_STATE_CHECK_ANYCHAR_STAR
849 : OP_ANYCHAR_STAR));
851 if (r) return r;
852 if (CKN_ON)
853 r = add_state_check_num(reg, ckn);
855 return r;
859 if (empty_info != 0)
860 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
861 else
862 mod_tlen = tlen;
864 if (infinite && qn->lower <= 1) {
865 if (qn->greedy) {
866 if (qn->lower == 1) {
867 r = add_opcode_rel_addr(reg, OP_JUMP,
868 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
869 if (r) return r;
872 if (CKN_ON) {
873 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
874 if (r) return r;
875 r = add_state_check_num(reg, ckn);
876 if (r) return r;
877 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
879 else {
880 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
882 if (r) return r;
883 r = compile_tree_empty_check(qn->target, reg, empty_info);
884 if (r) return r;
885 r = add_opcode_rel_addr(reg, OP_JUMP,
886 -(mod_tlen + (int )SIZE_OP_JUMP
887 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
889 else {
890 if (qn->lower == 0) {
891 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
892 if (r) return r;
894 r = compile_tree_empty_check(qn->target, reg, empty_info);
895 if (r) return r;
896 if (CKN_ON) {
897 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
898 if (r) return r;
899 r = add_state_check_num(reg, ckn);
900 if (r) return r;
901 r = add_rel_addr(reg,
902 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
904 else
905 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
908 else if (qn->upper == 0) {
909 if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
910 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
911 if (r) return r;
912 r = compile_tree(qn->target, reg);
914 else
915 r = 0;
917 else if (qn->upper == 1 && qn->greedy) {
918 if (qn->lower == 0) {
919 if (CKN_ON) {
920 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
921 if (r) return r;
922 r = add_state_check_num(reg, ckn);
923 if (r) return r;
924 r = add_rel_addr(reg, tlen);
926 else {
927 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
929 if (r) return r;
932 r = compile_tree(qn->target, reg);
934 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
935 if (CKN_ON) {
936 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
937 if (r) return r;
938 r = add_state_check_num(reg, ckn);
939 if (r) return r;
940 r = add_rel_addr(reg, SIZE_OP_JUMP);
942 else {
943 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
946 if (r) return r;
947 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
948 if (r) return r;
949 r = compile_tree(qn->target, reg);
951 else {
952 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
953 if (CKN_ON) {
954 if (r) return r;
955 r = add_opcode(reg, OP_STATE_CHECK);
956 if (r) return r;
957 r = add_state_check_num(reg, ckn);
960 return r;
963 #else /* USE_COMBINATION_EXPLOSION_CHECK */
965 static int
966 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
968 int len, mod_tlen;
969 int infinite = IS_REPEAT_INFINITE(qn->upper);
970 int empty_info = qn->target_empty_info;
971 int tlen = compile_length_tree(qn->target, reg);
973 if (tlen < 0) return tlen;
975 /* anychar repeat */
976 if (NTYPE(qn->target) == NT_CANY) {
977 if (qn->greedy && infinite) {
978 if (IS_NOT_NULL(qn->next_head_exact))
979 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
980 else
981 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
985 if (empty_info != 0)
986 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
987 else
988 mod_tlen = tlen;
990 if (infinite &&
991 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
992 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
993 len = SIZE_OP_JUMP;
995 else {
996 len = tlen * qn->lower;
999 if (qn->greedy) {
1000 if (IS_NOT_NULL(qn->head_exact))
1001 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1002 else if (IS_NOT_NULL(qn->next_head_exact))
1003 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1004 else
1005 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1007 else
1008 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1010 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1011 len = SIZE_OP_JUMP + tlen;
1013 else if (!infinite && qn->greedy &&
1014 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1015 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1016 len = tlen * qn->lower;
1017 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1019 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1020 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1022 else {
1023 len = SIZE_OP_REPEAT_INC
1024 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1027 return len;
1030 static int
1031 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1033 int i, r, mod_tlen;
1034 int infinite = IS_REPEAT_INFINITE(qn->upper);
1035 int empty_info = qn->target_empty_info;
1036 int tlen = compile_length_tree(qn->target, reg);
1038 if (tlen < 0) return tlen;
1040 if (is_anychar_star_quantifier(qn)) {
1041 r = compile_tree_n_times(qn->target, qn->lower, reg);
1042 if (r) return r;
1043 if (IS_NOT_NULL(qn->next_head_exact)) {
1044 if (IS_MULTILINE(reg->options))
1045 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1046 else
1047 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1048 if (r) return r;
1049 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1051 else {
1052 if (IS_MULTILINE(reg->options))
1053 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1054 else
1055 return add_opcode(reg, OP_ANYCHAR_STAR);
1059 if (empty_info != 0)
1060 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1061 else
1062 mod_tlen = tlen;
1064 if (infinite &&
1065 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1066 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1067 if (qn->greedy) {
1068 if (IS_NOT_NULL(qn->head_exact))
1069 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1070 else if (IS_NOT_NULL(qn->next_head_exact))
1071 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1072 else
1073 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1075 else {
1076 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1078 if (r) return r;
1080 else {
1081 r = compile_tree_n_times(qn->target, qn->lower, reg);
1082 if (r) return r;
1085 if (qn->greedy) {
1086 if (IS_NOT_NULL(qn->head_exact)) {
1087 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1088 mod_tlen + SIZE_OP_JUMP);
1089 if (r) return r;
1090 add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1091 r = compile_tree_empty_check(qn->target, reg, empty_info);
1092 if (r) return r;
1093 r = add_opcode_rel_addr(reg, OP_JUMP,
1094 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1096 else if (IS_NOT_NULL(qn->next_head_exact)) {
1097 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1098 mod_tlen + SIZE_OP_JUMP);
1099 if (r) return r;
1100 add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1101 r = compile_tree_empty_check(qn->target, reg, empty_info);
1102 if (r) return r;
1103 r = add_opcode_rel_addr(reg, OP_JUMP,
1104 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1106 else {
1107 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1108 if (r) return r;
1109 r = compile_tree_empty_check(qn->target, reg, empty_info);
1110 if (r) return r;
1111 r = add_opcode_rel_addr(reg, OP_JUMP,
1112 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1115 else {
1116 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1117 if (r) return r;
1118 r = compile_tree_empty_check(qn->target, reg, empty_info);
1119 if (r) return r;
1120 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1123 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1124 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1125 if (r) return r;
1126 r = compile_tree(qn->target, reg);
1128 else if (!infinite && qn->greedy &&
1129 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1130 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1131 int n = qn->upper - qn->lower;
1133 r = compile_tree_n_times(qn->target, qn->lower, reg);
1134 if (r) return r;
1136 for (i = 0; i < n; i++) {
1137 r = add_opcode_rel_addr(reg, OP_PUSH,
1138 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1139 if (r) return r;
1140 r = compile_tree(qn->target, reg);
1141 if (r) return r;
1144 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1145 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1146 if (r) return r;
1147 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1148 if (r) return r;
1149 r = compile_tree(qn->target, reg);
1151 else {
1152 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1154 return r;
1156 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1158 static int
1159 compile_length_option_node(EncloseNode* node, regex_t* reg)
1161 int tlen;
1162 OnigOptionType prev = reg->options;
1164 reg->options = node->option;
1165 tlen = compile_length_tree(node->target, reg);
1166 reg->options = prev;
1168 if (tlen < 0) return tlen;
1170 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1171 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1172 + tlen + SIZE_OP_SET_OPTION;
1174 else
1175 return tlen;
1178 static int
1179 compile_option_node(EncloseNode* node, regex_t* reg)
1181 int r;
1182 OnigOptionType prev = reg->options;
1184 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1185 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1186 if (r) return r;
1187 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1188 if (r) return r;
1189 r = add_opcode(reg, OP_FAIL);
1190 if (r) return r;
1193 reg->options = node->option;
1194 r = compile_tree(node->target, reg);
1195 reg->options = prev;
1197 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1198 if (r) return r;
1199 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1201 return r;
1204 static int
1205 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1207 int len;
1208 int tlen;
1210 if (node->type == ENCLOSE_OPTION)
1211 return compile_length_option_node(node, reg);
1213 if (node->target) {
1214 tlen = compile_length_tree(node->target, reg);
1215 if (tlen < 0) return tlen;
1217 else
1218 tlen = 0;
1220 switch (node->type) {
1221 case ENCLOSE_MEMORY:
1222 #ifdef USE_SUBEXP_CALL
1223 if (IS_ENCLOSE_CALLED(node)) {
1224 len = SIZE_OP_MEMORY_START_PUSH + tlen
1225 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1226 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1227 len += (IS_ENCLOSE_RECURSION(node)
1228 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1229 else
1230 len += (IS_ENCLOSE_RECURSION(node)
1231 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1233 else
1234 #endif
1236 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1237 len = SIZE_OP_MEMORY_START_PUSH;
1238 else
1239 len = SIZE_OP_MEMORY_START;
1241 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1242 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1244 break;
1246 case ENCLOSE_STOP_BACKTRACK:
1247 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1248 QtfrNode* qn = NQTFR(node->target);
1249 tlen = compile_length_tree(qn->target, reg);
1250 if (tlen < 0) return tlen;
1252 len = tlen * qn->lower
1253 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1255 else {
1256 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1258 break;
1260 default:
1261 return ONIGERR_TYPE_BUG;
1262 break;
1265 return len;
1268 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1270 static int
1271 compile_enclose_node(EncloseNode* node, regex_t* reg)
1273 int r, len;
1275 if (node->type == ENCLOSE_OPTION)
1276 return compile_option_node(node, reg);
1278 switch (node->type) {
1279 case ENCLOSE_MEMORY:
1280 #ifdef USE_SUBEXP_CALL
1281 if (IS_ENCLOSE_CALLED(node)) {
1282 r = add_opcode(reg, OP_CALL);
1283 if (r) return r;
1284 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1285 node->state |= NST_ADDR_FIXED;
1286 r = add_abs_addr(reg, (int )node->call_addr);
1287 if (r) return r;
1288 len = compile_length_tree(node->target, reg);
1289 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1290 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1291 len += (IS_ENCLOSE_RECURSION(node)
1292 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1293 else
1294 len += (IS_ENCLOSE_RECURSION(node)
1295 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1297 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1298 if (r) return r;
1300 #endif
1301 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1302 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1303 else
1304 r = add_opcode(reg, OP_MEMORY_START);
1305 if (r) return r;
1306 r = add_mem_num(reg, node->regnum);
1307 if (r) return r;
1308 r = compile_tree(node->target, reg);
1309 if (r) return r;
1310 #ifdef USE_SUBEXP_CALL
1311 if (IS_ENCLOSE_CALLED(node)) {
1312 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1313 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1314 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1315 else
1316 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1317 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1319 if (r) return r;
1320 r = add_mem_num(reg, node->regnum);
1321 if (r) return r;
1322 r = add_opcode(reg, OP_RETURN);
1324 else
1325 #endif
1327 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1328 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1329 else
1330 r = add_opcode(reg, OP_MEMORY_END);
1331 if (r) return r;
1332 r = add_mem_num(reg, node->regnum);
1334 break;
1336 case ENCLOSE_STOP_BACKTRACK:
1337 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1338 QtfrNode* qn = NQTFR(node->target);
1339 r = compile_tree_n_times(qn->target, qn->lower, reg);
1340 if (r) return r;
1342 len = compile_length_tree(qn->target, reg);
1343 if (len < 0) return len;
1345 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1346 if (r) return r;
1347 r = compile_tree(qn->target, reg);
1348 if (r) return r;
1349 r = add_opcode(reg, OP_POP);
1350 if (r) return r;
1351 r = add_opcode_rel_addr(reg, OP_JUMP,
1352 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1354 else {
1355 r = add_opcode(reg, OP_PUSH_STOP_BT);
1356 if (r) return r;
1357 r = compile_tree(node->target, reg);
1358 if (r) return r;
1359 r = add_opcode(reg, OP_POP_STOP_BT);
1361 break;
1363 default:
1364 return ONIGERR_TYPE_BUG;
1365 break;
1368 return r;
1371 static int
1372 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1374 int len;
1375 int tlen = 0;
1377 if (node->target) {
1378 tlen = compile_length_tree(node->target, reg);
1379 if (tlen < 0) return tlen;
1382 switch (node->type) {
1383 case ANCHOR_PREC_READ:
1384 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1385 break;
1386 case ANCHOR_PREC_READ_NOT:
1387 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1388 break;
1389 case ANCHOR_LOOK_BEHIND:
1390 len = SIZE_OP_LOOK_BEHIND + tlen;
1391 break;
1392 case ANCHOR_LOOK_BEHIND_NOT:
1393 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1394 break;
1396 default:
1397 len = SIZE_OPCODE;
1398 break;
1401 return len;
1404 static int
1405 compile_anchor_node(AnchorNode* node, regex_t* reg)
1407 int r, len;
1409 switch (node->type) {
1410 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1411 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1412 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1413 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1414 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1415 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1417 case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break;
1418 case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1419 #ifdef USE_WORD_BEGIN_END
1420 case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break;
1421 case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break;
1422 #endif
1424 case ANCHOR_PREC_READ:
1425 r = add_opcode(reg, OP_PUSH_POS);
1426 if (r) return r;
1427 r = compile_tree(node->target, reg);
1428 if (r) return r;
1429 r = add_opcode(reg, OP_POP_POS);
1430 break;
1432 case ANCHOR_PREC_READ_NOT:
1433 len = compile_length_tree(node->target, reg);
1434 if (len < 0) return len;
1435 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1436 if (r) return r;
1437 r = compile_tree(node->target, reg);
1438 if (r) return r;
1439 r = add_opcode(reg, OP_FAIL_POS);
1440 break;
1442 case ANCHOR_LOOK_BEHIND:
1444 int n;
1445 r = add_opcode(reg, OP_LOOK_BEHIND);
1446 if (r) return r;
1447 if (node->char_len < 0) {
1448 r = get_char_length_tree(node->target, reg, &n);
1449 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1451 else
1452 n = node->char_len;
1453 r = add_length(reg, n);
1454 if (r) return r;
1455 r = compile_tree(node->target, reg);
1457 break;
1459 case ANCHOR_LOOK_BEHIND_NOT:
1461 int n;
1462 len = compile_length_tree(node->target, reg);
1463 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1464 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1465 if (r) return r;
1466 if (node->char_len < 0) {
1467 r = get_char_length_tree(node->target, reg, &n);
1468 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1470 else
1471 n = node->char_len;
1472 r = add_length(reg, n);
1473 if (r) return r;
1474 r = compile_tree(node->target, reg);
1475 if (r) return r;
1476 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1478 break;
1480 default:
1481 return ONIGERR_TYPE_BUG;
1482 break;
1485 return r;
1488 static int
1489 compile_length_tree(Node* node, regex_t* reg)
1491 int len, type, r;
1493 type = NTYPE(node);
1494 switch (type) {
1495 case NT_LIST:
1496 len = 0;
1497 do {
1498 r = compile_length_tree(NCAR(node), reg);
1499 if (r < 0) return r;
1500 len += r;
1501 } while (IS_NOT_NULL(node = NCDR(node)));
1502 r = len;
1503 break;
1505 case NT_ALT:
1507 int n;
1509 n = r = 0;
1510 do {
1511 r += compile_length_tree(NCAR(node), reg);
1512 n++;
1513 } while (IS_NOT_NULL(node = NCDR(node)));
1514 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1516 break;
1518 case NT_STR:
1519 if (NSTRING_IS_RAW(node))
1520 r = compile_length_string_raw_node(NSTR(node), reg);
1521 else
1522 r = compile_length_string_node(node, reg);
1523 break;
1525 case NT_CCLASS:
1526 r = compile_length_cclass_node(NCCLASS(node), reg);
1527 break;
1529 case NT_CTYPE:
1530 case NT_CANY:
1531 r = SIZE_OPCODE;
1532 break;
1534 case NT_BREF:
1536 BRefNode* br = NBREF(node);
1538 #ifdef USE_BACKREF_WITH_LEVEL
1539 if (IS_BACKREF_NEST_LEVEL(br)) {
1540 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1541 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1543 else
1544 #endif
1545 if (br->back_num == 1) {
1546 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1547 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1549 else {
1550 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1553 break;
1555 #ifdef USE_SUBEXP_CALL
1556 case NT_CALL:
1557 r = SIZE_OP_CALL;
1558 break;
1559 #endif
1561 case NT_QTFR:
1562 r = compile_length_quantifier_node(NQTFR(node), reg);
1563 break;
1565 case NT_ENCLOSE:
1566 r = compile_length_enclose_node(NENCLOSE(node), reg);
1567 break;
1569 case NT_ANCHOR:
1570 r = compile_length_anchor_node(NANCHOR(node), reg);
1571 break;
1573 default:
1574 return ONIGERR_TYPE_BUG;
1575 break;
1578 return r;
1581 static int
1582 compile_tree(Node* node, regex_t* reg)
1584 int n, type, len, pos, r = 0;
1586 type = NTYPE(node);
1587 switch (type) {
1588 case NT_LIST:
1589 do {
1590 r = compile_tree(NCAR(node), reg);
1591 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1592 break;
1594 case NT_ALT:
1596 Node* x = node;
1597 len = 0;
1598 do {
1599 len += compile_length_tree(NCAR(x), reg);
1600 if (NCDR(x) != NULL) {
1601 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1603 } while (IS_NOT_NULL(x = NCDR(x)));
1604 pos = reg->used + len; /* goal position */
1606 do {
1607 len = compile_length_tree(NCAR(node), reg);
1608 if (IS_NOT_NULL(NCDR(node))) {
1609 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1610 if (r) break;
1612 r = compile_tree(NCAR(node), reg);
1613 if (r) break;
1614 if (IS_NOT_NULL(NCDR(node))) {
1615 len = pos - (reg->used + SIZE_OP_JUMP);
1616 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1617 if (r) break;
1619 } while (IS_NOT_NULL(node = NCDR(node)));
1621 break;
1623 case NT_STR:
1624 if (NSTRING_IS_RAW(node))
1625 r = compile_string_raw_node(NSTR(node), reg);
1626 else
1627 r = compile_string_node(node, reg);
1628 break;
1630 case NT_CCLASS:
1631 r = compile_cclass_node(NCCLASS(node), reg);
1632 break;
1634 case NT_CTYPE:
1636 int op;
1638 switch (NCTYPE(node)->ctype) {
1639 case ONIGENC_CTYPE_WORD:
1640 if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
1641 else op = OP_WORD;
1642 break;
1643 default:
1644 return ONIGERR_TYPE_BUG;
1645 break;
1647 r = add_opcode(reg, op);
1649 break;
1651 case NT_CANY:
1652 if (IS_MULTILINE(reg->options))
1653 r = add_opcode(reg, OP_ANYCHAR_ML);
1654 else
1655 r = add_opcode(reg, OP_ANYCHAR);
1656 break;
1658 case NT_BREF:
1660 BRefNode* br = NBREF(node);
1662 #ifdef USE_BACKREF_WITH_LEVEL
1663 if (IS_BACKREF_NEST_LEVEL(br)) {
1664 r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1665 if (r) return r;
1666 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1667 if (r) return r;
1668 r = add_length(reg, br->nest_level);
1669 if (r) return r;
1671 goto add_bacref_mems;
1673 else
1674 #endif
1675 if (br->back_num == 1) {
1676 n = br->back_static[0];
1677 if (IS_IGNORECASE(reg->options)) {
1678 r = add_opcode(reg, OP_BACKREFN_IC);
1679 if (r) return r;
1680 r = add_mem_num(reg, n);
1682 else {
1683 switch (n) {
1684 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1685 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1686 default:
1687 r = add_opcode(reg, OP_BACKREFN);
1688 if (r) return r;
1689 r = add_mem_num(reg, n);
1690 break;
1694 else {
1695 int i;
1696 int* p;
1698 if (IS_IGNORECASE(reg->options)) {
1699 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1701 else {
1702 r = add_opcode(reg, OP_BACKREF_MULTI);
1704 if (r) return r;
1706 #ifdef USE_BACKREF_WITH_LEVEL
1707 add_bacref_mems:
1708 #endif
1709 r = add_length(reg, br->back_num);
1710 if (r) return r;
1711 p = BACKREFS_P(br);
1712 for (i = br->back_num - 1; i >= 0; i--) {
1713 r = add_mem_num(reg, p[i]);
1714 if (r) return r;
1718 break;
1720 #ifdef USE_SUBEXP_CALL
1721 case NT_CALL:
1722 r = compile_call(NCALL(node), reg);
1723 break;
1724 #endif
1726 case NT_QTFR:
1727 r = compile_quantifier_node(NQTFR(node), reg);
1728 break;
1730 case NT_ENCLOSE:
1731 r = compile_enclose_node(NENCLOSE(node), reg);
1732 break;
1734 case NT_ANCHOR:
1735 r = compile_anchor_node(NANCHOR(node), reg);
1736 break;
1738 default:
1739 #ifdef ONIG_DEBUG
1740 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1741 #endif
1742 break;
1745 return r;
1748 #ifdef USE_NAMED_GROUP
1750 static int
1751 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1753 int r = 0;
1754 Node* node = *plink;
1756 switch (NTYPE(node)) {
1757 case NT_LIST:
1758 case NT_ALT:
1759 do {
1760 r = noname_disable_map(&(NCAR(node)), map, counter);
1761 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1762 break;
1764 case NT_QTFR:
1766 Node** ptarget = &(NQTFR(node)->target);
1767 Node* old = *ptarget;
1768 r = noname_disable_map(ptarget, map, counter);
1769 if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1770 onig_reduce_nested_quantifier(node, *ptarget);
1773 break;
1775 case NT_ENCLOSE:
1777 EncloseNode* en = NENCLOSE(node);
1778 if (en->type == ENCLOSE_MEMORY) {
1779 if (IS_ENCLOSE_NAMED_GROUP(en)) {
1780 (*counter)++;
1781 map[en->regnum].new_val = *counter;
1782 en->regnum = *counter;
1783 r = noname_disable_map(&(en->target), map, counter);
1785 else {
1786 *plink = en->target;
1787 en->target = NULL_NODE;
1788 onig_node_free(node);
1789 r = noname_disable_map(plink, map, counter);
1792 else
1793 r = noname_disable_map(&(en->target), map, counter);
1795 break;
1797 default:
1798 break;
1801 return r;
1804 static int
1805 renumber_node_backref(Node* node, GroupNumRemap* map)
1807 int i, pos, n, old_num;
1808 int *backs;
1809 BRefNode* bn = NBREF(node);
1811 if (! IS_BACKREF_NAME_REF(bn))
1812 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1814 old_num = bn->back_num;
1815 if (IS_NULL(bn->back_dynamic))
1816 backs = bn->back_static;
1817 else
1818 backs = bn->back_dynamic;
1820 for (i = 0, pos = 0; i < old_num; i++) {
1821 n = map[backs[i]].new_val;
1822 if (n > 0) {
1823 backs[pos] = n;
1824 pos++;
1828 bn->back_num = pos;
1829 return 0;
1832 static int
1833 renumber_by_map(Node* node, GroupNumRemap* map)
1835 int r = 0;
1837 switch (NTYPE(node)) {
1838 case NT_LIST:
1839 case NT_ALT:
1840 do {
1841 r = renumber_by_map(NCAR(node), map);
1842 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1843 break;
1844 case NT_QTFR:
1845 r = renumber_by_map(NQTFR(node)->target, map);
1846 break;
1847 case NT_ENCLOSE:
1848 r = renumber_by_map(NENCLOSE(node)->target, map);
1849 break;
1851 case NT_BREF:
1852 r = renumber_node_backref(node, map);
1853 break;
1855 default:
1856 break;
1859 return r;
1862 static int
1863 numbered_ref_check(Node* node)
1865 int r = 0;
1867 switch (NTYPE(node)) {
1868 case NT_LIST:
1869 case NT_ALT:
1870 do {
1871 r = numbered_ref_check(NCAR(node));
1872 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1873 break;
1874 case NT_QTFR:
1875 r = numbered_ref_check(NQTFR(node)->target);
1876 break;
1877 case NT_ENCLOSE:
1878 r = numbered_ref_check(NENCLOSE(node)->target);
1879 break;
1881 case NT_BREF:
1882 if (! IS_BACKREF_NAME_REF(NBREF(node)))
1883 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1884 break;
1886 default:
1887 break;
1890 return r;
1893 static int
1894 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1896 int r, i, pos, counter;
1897 BitStatusType loc;
1898 GroupNumRemap* map;
1900 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
1901 CHECK_NULL_RETURN_MEMERR(map);
1902 for (i = 1; i <= env->num_mem; i++) {
1903 map[i].new_val = 0;
1905 counter = 0;
1906 r = noname_disable_map(root, map, &counter);
1907 if (r != 0) return r;
1909 r = renumber_by_map(*root, map);
1910 if (r != 0) return r;
1912 for (i = 1, pos = 1; i <= env->num_mem; i++) {
1913 if (map[i].new_val > 0) {
1914 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1915 pos++;
1919 loc = env->capture_history;
1920 BIT_STATUS_CLEAR(env->capture_history);
1921 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1922 if (BIT_STATUS_AT(loc, i)) {
1923 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1927 env->num_mem = env->num_named;
1928 reg->num_mem = env->num_named;
1930 return onig_renumber_name_table(reg, map);
1932 #endif /* USE_NAMED_GROUP */
1934 #ifdef USE_SUBEXP_CALL
1935 static int
1936 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1938 int i, offset;
1939 EncloseNode* en;
1940 AbsAddrType addr;
1942 for (i = 0; i < uslist->num; i++) {
1943 en = NENCLOSE(uslist->us[i].target);
1944 if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1945 addr = en->call_addr;
1946 offset = uslist->us[i].offset;
1948 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1950 return 0;
1952 #endif
1954 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
1955 static int
1956 quantifiers_memory_node_info(Node* node)
1958 int r = 0;
1960 switch (NTYPE(node)) {
1961 case NT_LIST:
1962 case NT_ALT:
1964 int v;
1965 do {
1966 v = quantifiers_memory_node_info(NCAR(node));
1967 if (v > r) r = v;
1968 } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
1970 break;
1972 #ifdef USE_SUBEXP_CALL
1973 case NT_CALL:
1974 if (IS_CALL_RECURSION(NCALL(node))) {
1975 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
1977 else
1978 r = quantifiers_memory_node_info(NCALL(node)->target);
1979 break;
1980 #endif
1982 case NT_QTFR:
1984 QtfrNode* qn = NQTFR(node);
1985 if (qn->upper != 0) {
1986 r = quantifiers_memory_node_info(qn->target);
1989 break;
1991 case NT_ENCLOSE:
1993 EncloseNode* en = NENCLOSE(node);
1994 switch (en->type) {
1995 case ENCLOSE_MEMORY:
1996 return NQ_TARGET_IS_EMPTY_MEM;
1997 break;
1999 case ENCLOSE_OPTION:
2000 case ENCLOSE_STOP_BACKTRACK:
2001 r = quantifiers_memory_node_info(en->target);
2002 break;
2003 default:
2004 break;
2007 break;
2009 case NT_BREF:
2010 case NT_STR:
2011 case NT_CTYPE:
2012 case NT_CCLASS:
2013 case NT_CANY:
2014 case NT_ANCHOR:
2015 default:
2016 break;
2019 return r;
2021 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2023 static int
2024 get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2026 OnigDistance tmin;
2027 int r = 0;
2029 *min = 0;
2030 switch (NTYPE(node)) {
2031 case NT_BREF:
2033 int i;
2034 int* backs;
2035 Node** nodes = SCANENV_MEM_NODES(env);
2036 BRefNode* br = NBREF(node);
2037 if (br->state & NST_RECURSION) break;
2039 backs = BACKREFS_P(br);
2040 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2041 r = get_min_match_length(nodes[backs[0]], min, env);
2042 if (r != 0) break;
2043 for (i = 1; i < br->back_num; i++) {
2044 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2045 r = get_min_match_length(nodes[backs[i]], &tmin, env);
2046 if (r != 0) break;
2047 if (*min > tmin) *min = tmin;
2050 break;
2052 #ifdef USE_SUBEXP_CALL
2053 case NT_CALL:
2054 if (IS_CALL_RECURSION(NCALL(node))) {
2055 EncloseNode* en = NENCLOSE(NCALL(node)->target);
2056 if (IS_ENCLOSE_MIN_FIXED(en))
2057 *min = en->min_len;
2059 else
2060 r = get_min_match_length(NCALL(node)->target, min, env);
2061 break;
2062 #endif
2064 case NT_LIST:
2065 do {
2066 r = get_min_match_length(NCAR(node), &tmin, env);
2067 if (r == 0) *min += tmin;
2068 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2069 break;
2071 case NT_ALT:
2073 Node *x, *y;
2074 y = node;
2075 do {
2076 x = NCAR(y);
2077 r = get_min_match_length(x, &tmin, env);
2078 if (r != 0) break;
2079 if (y == node) *min = tmin;
2080 else if (*min > tmin) *min = tmin;
2081 } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2083 break;
2085 case NT_STR:
2087 StrNode* sn = NSTR(node);
2088 *min = sn->end - sn->s;
2090 break;
2092 case NT_CTYPE:
2093 *min = 1;
2094 break;
2096 case NT_CCLASS:
2097 case NT_CANY:
2098 *min = 1;
2099 break;
2101 case NT_QTFR:
2103 QtfrNode* qn = NQTFR(node);
2105 if (qn->lower > 0) {
2106 r = get_min_match_length(qn->target, min, env);
2107 if (r == 0)
2108 *min = distance_multiply(*min, qn->lower);
2111 break;
2113 case NT_ENCLOSE:
2115 EncloseNode* en = NENCLOSE(node);
2116 switch (en->type) {
2117 case ENCLOSE_MEMORY:
2118 #ifdef USE_SUBEXP_CALL
2119 if (IS_ENCLOSE_MIN_FIXED(en))
2120 *min = en->min_len;
2121 else {
2122 r = get_min_match_length(en->target, min, env);
2123 if (r == 0) {
2124 en->min_len = *min;
2125 SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2128 break;
2129 #endif
2130 case ENCLOSE_OPTION:
2131 case ENCLOSE_STOP_BACKTRACK:
2132 r = get_min_match_length(en->target, min, env);
2133 break;
2136 break;
2138 case NT_ANCHOR:
2139 default:
2140 break;
2143 return r;
2146 static int
2147 get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2149 OnigDistance tmax;
2150 int r = 0;
2152 *max = 0;
2153 switch (NTYPE(node)) {
2154 case NT_LIST:
2155 do {
2156 r = get_max_match_length(NCAR(node), &tmax, env);
2157 if (r == 0)
2158 *max = distance_add(*max, tmax);
2159 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2160 break;
2162 case NT_ALT:
2163 do {
2164 r = get_max_match_length(NCAR(node), &tmax, env);
2165 if (r == 0 && *max < tmax) *max = tmax;
2166 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2167 break;
2169 case NT_STR:
2171 StrNode* sn = NSTR(node);
2172 *max = sn->end - sn->s;
2174 break;
2176 case NT_CTYPE:
2177 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2178 break;
2180 case NT_CCLASS:
2181 case NT_CANY:
2182 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2183 break;
2185 case NT_BREF:
2187 int i;
2188 int* backs;
2189 Node** nodes = SCANENV_MEM_NODES(env);
2190 BRefNode* br = NBREF(node);
2191 if (br->state & NST_RECURSION) {
2192 *max = ONIG_INFINITE_DISTANCE;
2193 break;
2195 backs = BACKREFS_P(br);
2196 for (i = 0; i < br->back_num; i++) {
2197 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2198 r = get_max_match_length(nodes[backs[i]], &tmax, env);
2199 if (r != 0) break;
2200 if (*max < tmax) *max = tmax;
2203 break;
2205 #ifdef USE_SUBEXP_CALL
2206 case NT_CALL:
2207 if (! IS_CALL_RECURSION(NCALL(node)))
2208 r = get_max_match_length(NCALL(node)->target, max, env);
2209 else
2210 *max = ONIG_INFINITE_DISTANCE;
2211 break;
2212 #endif
2214 case NT_QTFR:
2216 QtfrNode* qn = NQTFR(node);
2218 if (qn->upper != 0) {
2219 r = get_max_match_length(qn->target, max, env);
2220 if (r == 0 && *max != 0) {
2221 if (! IS_REPEAT_INFINITE(qn->upper))
2222 *max = distance_multiply(*max, qn->upper);
2223 else
2224 *max = ONIG_INFINITE_DISTANCE;
2228 break;
2230 case NT_ENCLOSE:
2232 EncloseNode* en = NENCLOSE(node);
2233 switch (en->type) {
2234 case ENCLOSE_MEMORY:
2235 #ifdef USE_SUBEXP_CALL
2236 if (IS_ENCLOSE_MAX_FIXED(en))
2237 *max = en->max_len;
2238 else {
2239 r = get_max_match_length(en->target, max, env);
2240 if (r == 0) {
2241 en->max_len = *max;
2242 SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2245 break;
2246 #endif
2247 case ENCLOSE_OPTION:
2248 case ENCLOSE_STOP_BACKTRACK:
2249 r = get_max_match_length(en->target, max, env);
2250 break;
2253 break;
2255 case NT_ANCHOR:
2256 default:
2257 break;
2260 return r;
2263 #define GET_CHAR_LEN_VARLEN -1
2264 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2266 /* fixed size pattern node only */
2267 static int
2268 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2270 int tlen;
2271 int r = 0;
2273 level++;
2274 *len = 0;
2275 switch (NTYPE(node)) {
2276 case NT_LIST:
2277 do {
2278 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2279 if (r == 0)
2280 *len = distance_add(*len, tlen);
2281 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2282 break;
2284 case NT_ALT:
2286 int tlen2;
2287 int varlen = 0;
2289 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2290 while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2291 r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2292 if (r == 0) {
2293 if (tlen != tlen2)
2294 varlen = 1;
2297 if (r == 0) {
2298 if (varlen != 0) {
2299 if (level == 1)
2300 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2301 else
2302 r = GET_CHAR_LEN_VARLEN;
2304 else
2305 *len = tlen;
2308 break;
2310 case NT_STR:
2312 StrNode* sn = NSTR(node);
2313 UChar *s = sn->s;
2314 while (s < sn->end) {
2315 s += enclen(reg->enc, s, sn->end);
2316 (*len)++;
2319 break;
2321 case NT_QTFR:
2323 QtfrNode* qn = NQTFR(node);
2324 if (qn->lower == qn->upper) {
2325 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2326 if (r == 0)
2327 *len = distance_multiply(tlen, qn->lower);
2329 else
2330 r = GET_CHAR_LEN_VARLEN;
2332 break;
2334 #ifdef USE_SUBEXP_CALL
2335 case NT_CALL:
2336 if (! IS_CALL_RECURSION(NCALL(node)))
2337 r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2338 else
2339 r = GET_CHAR_LEN_VARLEN;
2340 break;
2341 #endif
2343 case NT_CTYPE:
2344 *len = 1;
2345 break;
2347 case NT_CCLASS:
2348 case NT_CANY:
2349 *len = 1;
2350 break;
2352 case NT_ENCLOSE:
2354 EncloseNode* en = NENCLOSE(node);
2355 switch (en->type) {
2356 case ENCLOSE_MEMORY:
2357 #ifdef USE_SUBEXP_CALL
2358 if (IS_ENCLOSE_CLEN_FIXED(en))
2359 *len = en->char_len;
2360 else {
2361 r = get_char_length_tree1(en->target, reg, len, level);
2362 if (r == 0) {
2363 en->char_len = *len;
2364 SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2367 break;
2368 #endif
2369 case ENCLOSE_OPTION:
2370 case ENCLOSE_STOP_BACKTRACK:
2371 r = get_char_length_tree1(en->target, reg, len, level);
2372 break;
2373 default:
2374 break;
2377 break;
2379 case NT_ANCHOR:
2380 break;
2382 default:
2383 r = GET_CHAR_LEN_VARLEN;
2384 break;
2387 return r;
2390 static int
2391 get_char_length_tree(Node* node, regex_t* reg, int* len)
2393 return get_char_length_tree1(node, reg, len, 0);
2396 /* x is not included y ==> 1 : 0 */
2397 static int
2398 is_not_included(Node* x, Node* y, regex_t* reg)
2400 int i, len;
2401 OnigCodePoint code;
2402 UChar *p, c;
2403 int ytype;
2405 retry:
2406 ytype = NTYPE(y);
2407 switch (NTYPE(x)) {
2408 case NT_CTYPE:
2410 switch (ytype) {
2411 case NT_CTYPE:
2412 if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2413 NCTYPE(y)->not != NCTYPE(x)->not)
2414 return 1;
2415 else
2416 return 0;
2417 break;
2419 case NT_CCLASS:
2420 swap:
2422 Node* tmp;
2423 tmp = x; x = y; y = tmp;
2424 goto retry;
2426 break;
2428 case NT_STR:
2429 goto swap;
2430 break;
2432 default:
2433 break;
2436 break;
2438 case NT_CCLASS:
2440 CClassNode* xc = NCCLASS(x);
2441 switch (ytype) {
2442 case NT_CTYPE:
2443 switch (NCTYPE(y)->ctype) {
2444 case ONIGENC_CTYPE_WORD:
2445 if (NCTYPE(y)->not == 0) {
2446 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2447 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2448 if (BITSET_AT(xc->bs, i)) {
2449 if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2452 return 1;
2454 return 0;
2456 else {
2457 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2458 if (! IS_CODE_SB_WORD(reg->enc, i)) {
2459 if (!IS_NCCLASS_NOT(xc)) {
2460 if (BITSET_AT(xc->bs, i))
2461 return 0;
2463 else {
2464 if (! BITSET_AT(xc->bs, i))
2465 return 0;
2469 return 1;
2471 break;
2473 default:
2474 break;
2476 break;
2478 case NT_CCLASS:
2480 int v;
2481 CClassNode* yc = NCCLASS(y);
2483 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2484 v = BITSET_AT(xc->bs, i);
2485 if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2486 (v == 0 && IS_NCCLASS_NOT(xc))) {
2487 v = BITSET_AT(yc->bs, i);
2488 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2489 (v == 0 && IS_NCCLASS_NOT(yc)))
2490 return 0;
2493 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2494 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2495 return 1;
2496 return 0;
2498 break;
2500 case NT_STR:
2501 goto swap;
2502 break;
2504 default:
2505 break;
2508 break;
2510 case NT_STR:
2512 StrNode* xs = NSTR(x);
2513 if (NSTRING_LEN(x) == 0)
2514 break;
2516 c = *(xs->s);
2517 switch (ytype) {
2518 case NT_CTYPE:
2519 switch (NCTYPE(y)->ctype) {
2520 case ONIGENC_CTYPE_WORD:
2521 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2522 return NCTYPE(y)->not;
2523 else
2524 return !(NCTYPE(y)->not);
2525 break;
2526 default:
2527 break;
2529 break;
2531 case NT_CCLASS:
2533 CClassNode* cc = NCCLASS(y);
2535 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2536 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2537 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2539 break;
2541 case NT_STR:
2543 UChar *q;
2544 StrNode* ys = NSTR(y);
2545 len = NSTRING_LEN(x);
2546 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2547 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2548 /* tiny version */
2549 return 0;
2551 else {
2552 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2553 if (*p != *q) return 1;
2557 break;
2559 default:
2560 break;
2563 break;
2565 default:
2566 break;
2569 return 0;
2572 static Node*
2573 get_head_value_node(Node* node, int exact, regex_t* reg)
2575 Node* n = NULL_NODE;
2577 switch (NTYPE(node)) {
2578 case NT_BREF:
2579 case NT_ALT:
2580 case NT_CANY:
2581 #ifdef USE_SUBEXP_CALL
2582 case NT_CALL:
2583 #endif
2584 break;
2586 case NT_CTYPE:
2587 case NT_CCLASS:
2588 if (exact == 0) {
2589 n = node;
2591 break;
2593 case NT_LIST:
2594 n = get_head_value_node(NCAR(node), exact, reg);
2595 break;
2597 case NT_STR:
2599 StrNode* sn = NSTR(node);
2601 if (sn->end <= sn->s)
2602 break;
2604 if (exact != 0 &&
2605 !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2607 else {
2608 n = node;
2611 break;
2613 case NT_QTFR:
2615 QtfrNode* qn = NQTFR(node);
2616 if (qn->lower > 0) {
2617 if (IS_NOT_NULL(qn->head_exact))
2618 n = qn->head_exact;
2619 else
2620 n = get_head_value_node(qn->target, exact, reg);
2623 break;
2625 case NT_ENCLOSE:
2627 EncloseNode* en = NENCLOSE(node);
2628 switch (en->type) {
2629 case ENCLOSE_OPTION:
2631 OnigOptionType options = reg->options;
2633 reg->options = NENCLOSE(node)->option;
2634 n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2635 reg->options = options;
2637 break;
2639 case ENCLOSE_MEMORY:
2640 case ENCLOSE_STOP_BACKTRACK:
2641 n = get_head_value_node(en->target, exact, reg);
2642 break;
2645 break;
2647 case NT_ANCHOR:
2648 if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2649 n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2650 break;
2652 default:
2653 break;
2656 return n;
2659 static int
2660 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2662 int type, r = 0;
2664 type = NTYPE(node);
2665 if ((NTYPE2BIT(type) & type_mask) == 0)
2666 return 1;
2668 switch (type) {
2669 case NT_LIST:
2670 case NT_ALT:
2671 do {
2672 r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2673 anchor_mask);
2674 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2675 break;
2677 case NT_QTFR:
2678 r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2679 anchor_mask);
2680 break;
2682 case NT_ENCLOSE:
2684 EncloseNode* en = NENCLOSE(node);
2685 if ((en->type & enclose_mask) == 0)
2686 return 1;
2688 r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2690 break;
2692 case NT_ANCHOR:
2693 type = NANCHOR(node)->type;
2694 if ((type & anchor_mask) == 0)
2695 return 1;
2697 if (NANCHOR(node)->target)
2698 r = check_type_tree(NANCHOR(node)->target,
2699 type_mask, enclose_mask, anchor_mask);
2700 break;
2702 default:
2703 break;
2705 return r;
2708 #ifdef USE_SUBEXP_CALL
2710 #define RECURSION_EXIST 1
2711 #define RECURSION_INFINITE 2
2713 static int
2714 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2716 int type;
2717 int r = 0;
2719 type = NTYPE(node);
2720 switch (type) {
2721 case NT_LIST:
2723 Node *x;
2724 OnigDistance min;
2725 int ret;
2727 x = node;
2728 do {
2729 ret = subexp_inf_recursive_check(NCAR(x), env, head);
2730 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2731 r |= ret;
2732 if (head) {
2733 ret = get_min_match_length(NCAR(x), &min, env);
2734 if (ret != 0) return ret;
2735 if (min != 0) head = 0;
2737 } while (IS_NOT_NULL(x = NCDR(x)));
2739 break;
2741 case NT_ALT:
2743 int ret;
2744 r = RECURSION_EXIST;
2745 do {
2746 ret = subexp_inf_recursive_check(NCAR(node), env, head);
2747 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2748 r &= ret;
2749 } while (IS_NOT_NULL(node = NCDR(node)));
2751 break;
2753 case NT_QTFR:
2754 r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2755 if (r == RECURSION_EXIST) {
2756 if (NQTFR(node)->lower == 0) r = 0;
2758 break;
2760 case NT_ANCHOR:
2762 AnchorNode* an = NANCHOR(node);
2763 switch (an->type) {
2764 case ANCHOR_PREC_READ:
2765 case ANCHOR_PREC_READ_NOT:
2766 case ANCHOR_LOOK_BEHIND:
2767 case ANCHOR_LOOK_BEHIND_NOT:
2768 r = subexp_inf_recursive_check(an->target, env, head);
2769 break;
2772 break;
2774 case NT_CALL:
2775 r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2776 break;
2778 case NT_ENCLOSE:
2779 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2780 return 0;
2781 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2782 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2783 else {
2784 SET_ENCLOSE_STATUS(node, NST_MARK2);
2785 r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2786 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2788 break;
2790 default:
2791 break;
2794 return r;
2797 static int
2798 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2800 int type;
2801 int r = 0;
2803 type = NTYPE(node);
2804 switch (type) {
2805 case NT_LIST:
2806 case NT_ALT:
2807 do {
2808 r = subexp_inf_recursive_check_trav(NCAR(node), env);
2809 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2810 break;
2812 case NT_QTFR:
2813 r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
2814 break;
2816 case NT_ANCHOR:
2818 AnchorNode* an = NANCHOR(node);
2819 switch (an->type) {
2820 case ANCHOR_PREC_READ:
2821 case ANCHOR_PREC_READ_NOT:
2822 case ANCHOR_LOOK_BEHIND:
2823 case ANCHOR_LOOK_BEHIND_NOT:
2824 r = subexp_inf_recursive_check_trav(an->target, env);
2825 break;
2828 break;
2830 case NT_ENCLOSE:
2832 EncloseNode* en = NENCLOSE(node);
2834 if (IS_ENCLOSE_RECURSION(en)) {
2835 SET_ENCLOSE_STATUS(node, NST_MARK1);
2836 r = subexp_inf_recursive_check(en->target, env, 1);
2837 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2838 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2840 r = subexp_inf_recursive_check_trav(en->target, env);
2843 break;
2845 default:
2846 break;
2849 return r;
2852 static int
2853 subexp_recursive_check(Node* node)
2855 int r = 0;
2857 switch (NTYPE(node)) {
2858 case NT_LIST:
2859 case NT_ALT:
2860 do {
2861 r |= subexp_recursive_check(NCAR(node));
2862 } while (IS_NOT_NULL(node = NCDR(node)));
2863 break;
2865 case NT_QTFR:
2866 r = subexp_recursive_check(NQTFR(node)->target);
2867 break;
2869 case NT_ANCHOR:
2871 AnchorNode* an = NANCHOR(node);
2872 switch (an->type) {
2873 case ANCHOR_PREC_READ:
2874 case ANCHOR_PREC_READ_NOT:
2875 case ANCHOR_LOOK_BEHIND:
2876 case ANCHOR_LOOK_BEHIND_NOT:
2877 r = subexp_recursive_check(an->target);
2878 break;
2881 break;
2883 case NT_CALL:
2884 r = subexp_recursive_check(NCALL(node)->target);
2885 if (r != 0) SET_CALL_RECURSION(node);
2886 break;
2888 case NT_ENCLOSE:
2889 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2890 return 0;
2891 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2892 return 1; /* recursion */
2893 else {
2894 SET_ENCLOSE_STATUS(node, NST_MARK2);
2895 r = subexp_recursive_check(NENCLOSE(node)->target);
2896 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2898 break;
2900 default:
2901 break;
2904 return r;
2908 static int
2909 subexp_recursive_check_trav(Node* node, ScanEnv* env)
2911 #define FOUND_CALLED_NODE 1
2913 int type;
2914 int r = 0;
2916 type = NTYPE(node);
2917 switch (type) {
2918 case NT_LIST:
2919 case NT_ALT:
2921 int ret;
2922 do {
2923 ret = subexp_recursive_check_trav(NCAR(node), env);
2924 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2925 else if (ret < 0) return ret;
2926 } while (IS_NOT_NULL(node = NCDR(node)));
2928 break;
2930 case NT_QTFR:
2931 r = subexp_recursive_check_trav(NQTFR(node)->target, env);
2932 if (NQTFR(node)->upper == 0) {
2933 if (r == FOUND_CALLED_NODE)
2934 NQTFR(node)->is_refered = 1;
2936 break;
2938 case NT_ANCHOR:
2940 AnchorNode* an = NANCHOR(node);
2941 switch (an->type) {
2942 case ANCHOR_PREC_READ:
2943 case ANCHOR_PREC_READ_NOT:
2944 case ANCHOR_LOOK_BEHIND:
2945 case ANCHOR_LOOK_BEHIND_NOT:
2946 r = subexp_recursive_check_trav(an->target, env);
2947 break;
2950 break;
2952 case NT_ENCLOSE:
2954 EncloseNode* en = NENCLOSE(node);
2956 if (! IS_ENCLOSE_RECURSION(en)) {
2957 if (IS_ENCLOSE_CALLED(en)) {
2958 SET_ENCLOSE_STATUS(node, NST_MARK1);
2959 r = subexp_recursive_check(en->target);
2960 if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
2961 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2964 r = subexp_recursive_check_trav(en->target, env);
2965 if (IS_ENCLOSE_CALLED(en))
2966 r |= FOUND_CALLED_NODE;
2968 break;
2970 default:
2971 break;
2974 return r;
2977 static int
2978 setup_subexp_call(Node* node, ScanEnv* env)
2980 int type;
2981 int r = 0;
2983 type = NTYPE(node);
2984 switch (type) {
2985 case NT_LIST:
2986 do {
2987 r = setup_subexp_call(NCAR(node), env);
2988 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2989 break;
2991 case NT_ALT:
2992 do {
2993 r = setup_subexp_call(NCAR(node), env);
2994 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2995 break;
2997 case NT_QTFR:
2998 r = setup_subexp_call(NQTFR(node)->target, env);
2999 break;
3000 case NT_ENCLOSE:
3001 r = setup_subexp_call(NENCLOSE(node)->target, env);
3002 break;
3004 case NT_CALL:
3006 CallNode* cn = NCALL(node);
3007 Node** nodes = SCANENV_MEM_NODES(env);
3009 if (cn->group_num != 0) {
3010 int gnum = cn->group_num;
3012 #ifdef USE_NAMED_GROUP
3013 if (env->num_named > 0 &&
3014 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3015 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3016 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3018 #endif
3019 if (gnum > env->num_mem) {
3020 onig_scan_env_set_error_string(env,
3021 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3022 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3025 #ifdef USE_NAMED_GROUP
3026 set_call_attr:
3027 #endif
3028 cn->target = nodes[cn->group_num];
3029 if (IS_NULL(cn->target)) {
3030 onig_scan_env_set_error_string(env,
3031 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3032 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3034 SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3035 BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3036 cn->unset_addr_list = env->unset_addr_list;
3038 #ifdef USE_NAMED_GROUP
3039 else {
3040 int *refs;
3042 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3043 &refs);
3044 if (n <= 0) {
3045 onig_scan_env_set_error_string(env,
3046 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3047 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3049 else if (n > 1) {
3050 onig_scan_env_set_error_string(env,
3051 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3052 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3054 else {
3055 cn->group_num = refs[0];
3056 goto set_call_attr;
3059 #endif
3061 break;
3063 case NT_ANCHOR:
3065 AnchorNode* an = NANCHOR(node);
3067 switch (an->type) {
3068 case ANCHOR_PREC_READ:
3069 case ANCHOR_PREC_READ_NOT:
3070 case ANCHOR_LOOK_BEHIND:
3071 case ANCHOR_LOOK_BEHIND_NOT:
3072 r = setup_subexp_call(an->target, env);
3073 break;
3076 break;
3078 default:
3079 break;
3082 return r;
3084 #endif
3086 /* divide different length alternatives in look-behind.
3087 (?<=A|B) ==> (?<=A)|(?<=B)
3088 (?<!A|B) ==> (?<!A)(?<!B)
3090 static int
3091 divide_look_behind_alternatives(Node* node)
3093 Node *head, *np, *insert_node;
3094 AnchorNode* an = NANCHOR(node);
3095 int anc_type = an->type;
3097 head = an->target;
3098 np = NCAR(head);
3099 swap_node(node, head);
3100 NCAR(node) = head;
3101 NANCHOR(head)->target = np;
3103 np = node;
3104 while ((np = NCDR(np)) != NULL_NODE) {
3105 insert_node = onig_node_new_anchor(anc_type);
3106 CHECK_NULL_RETURN_MEMERR(insert_node);
3107 NANCHOR(insert_node)->target = NCAR(np);
3108 NCAR(np) = insert_node;
3111 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3112 np = node;
3113 do {
3114 SET_NTYPE(np, NT_LIST); /* alt -> list */
3115 } while ((np = NCDR(np)) != NULL_NODE);
3117 return 0;
3120 static int
3121 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3123 int r, len;
3124 AnchorNode* an = NANCHOR(node);
3126 r = get_char_length_tree(an->target, reg, &len);
3127 if (r == 0)
3128 an->char_len = len;
3129 else if (r == GET_CHAR_LEN_VARLEN)
3130 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3131 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3132 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3133 r = divide_look_behind_alternatives(node);
3134 else
3135 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3138 return r;
3141 static int
3142 next_setup(Node* node, Node* next_node, regex_t* reg)
3144 int type;
3146 retry:
3147 type = NTYPE(node);
3148 if (type == NT_QTFR) {
3149 QtfrNode* qn = NQTFR(node);
3150 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3151 #ifdef USE_QTFR_PEEK_NEXT
3152 Node* n = get_head_value_node(next_node, 1, reg);
3153 /* '\0': for UTF-16BE etc... */
3154 if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3155 qn->next_head_exact = n;
3157 #endif
3158 /* automatic posseivation a*b ==> (?>a*)b */
3159 if (qn->lower <= 1) {
3160 int ttype = NTYPE(qn->target);
3161 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3162 Node *x, *y;
3163 x = get_head_value_node(qn->target, 0, reg);
3164 if (IS_NOT_NULL(x)) {
3165 y = get_head_value_node(next_node, 0, reg);
3166 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3167 Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3168 CHECK_NULL_RETURN_MEMERR(en);
3169 SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3170 swap_node(node, en);
3171 NENCLOSE(node)->target = en;
3178 else if (type == NT_ENCLOSE) {
3179 EncloseNode* en = NENCLOSE(node);
3180 if (en->type == ENCLOSE_MEMORY) {
3181 node = en->target;
3182 goto retry;
3185 return 0;
3189 static int
3190 update_string_node_case_fold(regex_t* reg, Node *node)
3192 UChar *p, *q, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3193 UChar *sbuf, *ebuf, *sp;
3194 int r, i, len, sbuf_size;
3195 StrNode* sn = NSTR(node);
3197 end = sn->end;
3198 sbuf_size = (end - sn->s) * 2;
3199 sbuf = (UChar* )xmalloc(sbuf_size);
3200 CHECK_NULL_RETURN_MEMERR(sbuf);
3201 ebuf = sbuf + sbuf_size;
3203 sp = sbuf;
3204 p = sn->s;
3205 while (p < end) {
3206 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3207 q = buf;
3208 for (i = 0; i < len; i++) {
3209 if (sp >= ebuf) {
3210 sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3211 CHECK_NULL_RETURN_MEMERR(sbuf);
3212 sp = sbuf + sbuf_size;
3213 sbuf_size *= 2;
3214 ebuf = sbuf + sbuf_size;
3217 *sp++ = buf[i];
3221 r = onig_node_str_set(node, sbuf, sp);
3222 if (r != 0) {
3223 xfree(sbuf);
3224 return r;
3227 xfree(sbuf);
3228 return 0;
3231 static int
3232 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3233 regex_t* reg)
3235 int r;
3236 Node *node;
3238 node = onig_node_new_str(s, end);
3239 if (IS_NULL(node)) return ONIGERR_MEMORY;
3241 r = update_string_node_case_fold(reg, node);
3242 if (r != 0) {
3243 onig_node_free(node);
3244 return r;
3247 NSTRING_SET_AMBIG(node);
3248 NSTRING_SET_DONT_GET_OPT_INFO(node);
3249 *rnode = node;
3250 return 0;
3253 static int
3254 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3255 UChar *p, int slen, UChar *end,
3256 regex_t* reg, Node **rnode)
3258 int r, i, j, len, varlen;
3259 Node *anode, *var_anode, *snode, *xnode, *an;
3260 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3262 *rnode = var_anode = NULL_NODE;
3264 varlen = 0;
3265 for (i = 0; i < item_num; i++) {
3266 if (items[i].byte_len != slen) {
3267 varlen = 1;
3268 break;
3272 if (varlen != 0) {
3273 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3274 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3276 xnode = onig_node_new_list(NULL, NULL);
3277 if (IS_NULL(xnode)) goto mem_err;
3278 NCAR(var_anode) = xnode;
3280 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3281 if (IS_NULL(anode)) goto mem_err;
3282 NCAR(xnode) = anode;
3284 else {
3285 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3286 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3289 snode = onig_node_new_str(p, p + slen);
3290 if (IS_NULL(snode)) goto mem_err;
3292 NCAR(anode) = snode;
3294 for (i = 0; i < item_num; i++) {
3295 snode = onig_node_new_str(NULL, NULL);
3296 if (IS_NULL(snode)) goto mem_err;
3298 for (j = 0; j < items[i].code_len; j++) {
3299 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3300 if (len < 0) {
3301 r = len;
3302 goto mem_err2;
3305 r = onig_node_str_cat(snode, buf, buf + len);
3306 if (r != 0) goto mem_err2;
3309 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3310 if (IS_NULL(an)) {
3311 goto mem_err2;
3314 if (items[i].byte_len != slen) {
3315 Node *rem;
3316 UChar *q = p + items[i].byte_len;
3318 if (q < end) {
3319 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3320 if (r != 0) {
3321 onig_node_free(an);
3322 goto mem_err2;
3325 xnode = onig_node_list_add(NULL_NODE, snode);
3326 if (IS_NULL(xnode)) {
3327 onig_node_free(an);
3328 onig_node_free(rem);
3329 goto mem_err2;
3331 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3332 onig_node_free(an);
3333 onig_node_free(xnode);
3334 onig_node_free(rem);
3335 goto mem_err;
3338 NCAR(an) = xnode;
3340 else {
3341 NCAR(an) = snode;
3344 NCDR(var_anode) = an;
3345 var_anode = an;
3347 else {
3348 NCAR(an) = snode;
3349 NCDR(anode) = an;
3350 anode = an;
3354 return varlen;
3356 mem_err2:
3357 onig_node_free(snode);
3359 mem_err:
3360 onig_node_free(*rnode);
3362 return ONIGERR_MEMORY;
3365 static int
3366 expand_case_fold_string(Node* node, regex_t* reg)
3368 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3370 int r, n, len, alt_num;
3371 UChar *start, *end, *p;
3372 Node *top_root, *root, *snode, *prev_node;
3373 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3374 StrNode* sn = NSTR(node);
3376 if (NSTRING_IS_AMBIG(node)) return 0;
3378 start = sn->s;
3379 end = sn->end;
3380 if (start >= end) return 0;
3382 r = 0;
3383 top_root = root = prev_node = snode = NULL_NODE;
3384 alt_num = 1;
3385 p = start;
3386 while (p < end) {
3387 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3388 p, end, items);
3389 if (n < 0) {
3390 r = n;
3391 goto err;
3394 len = enclen(reg->enc, p, end);
3396 if (n == 0) {
3397 if (IS_NULL(snode)) {
3398 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3399 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3400 if (IS_NULL(root)) {
3401 onig_node_free(prev_node);
3402 goto mem_err;
3406 prev_node = snode = onig_node_new_str(NULL, NULL);
3407 if (IS_NULL(snode)) goto mem_err;
3408 if (IS_NOT_NULL(root)) {
3409 if (IS_NULL(onig_node_list_add(root, snode))) {
3410 onig_node_free(snode);
3411 goto mem_err;
3416 r = onig_node_str_cat(snode, p, p + len);
3417 if (r != 0) goto err;
3419 else {
3420 alt_num *= (n + 1);
3421 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3423 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3424 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3425 if (IS_NULL(root)) {
3426 onig_node_free(prev_node);
3427 goto mem_err;
3431 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3432 if (r < 0) goto mem_err;
3433 if (r == 1) {
3434 if (IS_NULL(root)) {
3435 top_root = prev_node;
3437 else {
3438 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3439 onig_node_free(prev_node);
3440 goto mem_err;
3444 root = NCAR(prev_node);
3446 else { /* r == 0 */
3447 if (IS_NOT_NULL(root)) {
3448 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3449 onig_node_free(prev_node);
3450 goto mem_err;
3455 snode = NULL_NODE;
3458 p += len;
3461 if (p < end) {
3462 Node *srem;
3464 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3465 if (r != 0) goto mem_err;
3467 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3468 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3469 if (IS_NULL(root)) {
3470 onig_node_free(srem);
3471 onig_node_free(prev_node);
3472 goto mem_err;
3476 if (IS_NULL(root)) {
3477 prev_node = srem;
3479 else {
3480 if (IS_NULL(onig_node_list_add(root, srem))) {
3481 onig_node_free(srem);
3482 goto mem_err;
3487 /* ending */
3488 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3489 swap_node(node, top_root);
3490 onig_node_free(top_root);
3491 return 0;
3493 mem_err:
3494 r = ONIGERR_MEMORY;
3496 err:
3497 onig_node_free(top_root);
3498 return r;
3502 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3504 #define CEC_THRES_NUM_BIG_REPEAT 512
3505 #define CEC_INFINITE_NUM 0x7fffffff
3507 #define CEC_IN_INFINITE_REPEAT (1<<0)
3508 #define CEC_IN_FINITE_REPEAT (1<<1)
3509 #define CEC_CONT_BIG_REPEAT (1<<2)
3511 static int
3512 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3514 int type;
3515 int r = state;
3517 type = NTYPE(node);
3518 switch (type) {
3519 case NT_LIST:
3521 Node* prev = NULL_NODE;
3522 do {
3523 r = setup_comb_exp_check(NCAR(node), r, env);
3524 prev = NCAR(node);
3525 } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3527 break;
3529 case NT_ALT:
3531 int ret;
3532 do {
3533 ret = setup_comb_exp_check(NCAR(node), state, env);
3534 r |= ret;
3535 } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3537 break;
3539 case NT_QTFR:
3541 int child_state = state;
3542 int add_state = 0;
3543 QtfrNode* qn = NQTFR(node);
3544 Node* target = qn->target;
3545 int var_num;
3547 if (! IS_REPEAT_INFINITE(qn->upper)) {
3548 if (qn->upper > 1) {
3549 /* {0,1}, {1,1} are allowed */
3550 child_state |= CEC_IN_FINITE_REPEAT;
3552 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3553 if (env->backrefed_mem == 0) {
3554 if (NTYPE(qn->target) == NT_ENCLOSE) {
3555 EncloseNode* en = NENCLOSE(qn->target);
3556 if (en->type == ENCLOSE_MEMORY) {
3557 if (NTYPE(en->target) == NT_QTFR) {
3558 QtfrNode* q = NQTFR(en->target);
3559 if (IS_REPEAT_INFINITE(q->upper)
3560 && q->greedy == qn->greedy) {
3561 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3562 if (qn->upper == 1)
3563 child_state = state;
3572 if (state & CEC_IN_FINITE_REPEAT) {
3573 qn->comb_exp_check_num = -1;
3575 else {
3576 if (IS_REPEAT_INFINITE(qn->upper)) {
3577 var_num = CEC_INFINITE_NUM;
3578 child_state |= CEC_IN_INFINITE_REPEAT;
3580 else {
3581 var_num = qn->upper - qn->lower;
3584 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3585 add_state |= CEC_CONT_BIG_REPEAT;
3587 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3588 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3589 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3590 if (qn->comb_exp_check_num == 0) {
3591 env->num_comb_exp_check++;
3592 qn->comb_exp_check_num = env->num_comb_exp_check;
3593 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3594 env->comb_exp_max_regnum = env->curr_max_regnum;
3599 r = setup_comb_exp_check(target, child_state, env);
3600 r |= add_state;
3602 break;
3604 case NT_ENCLOSE:
3606 EncloseNode* en = NENCLOSE(node);
3608 switch (en->type) {
3609 case ENCLOSE_MEMORY:
3611 if (env->curr_max_regnum < en->regnum)
3612 env->curr_max_regnum = en->regnum;
3614 r = setup_comb_exp_check(en->target, state, env);
3616 break;
3618 default:
3619 r = setup_comb_exp_check(en->target, state, env);
3620 break;
3623 break;
3625 #ifdef USE_SUBEXP_CALL
3626 case NT_CALL:
3627 if (IS_CALL_RECURSION(NCALL(node)))
3628 env->has_recursion = 1;
3629 else
3630 r = setup_comb_exp_check(NCALL(node)->target, state, env);
3631 break;
3632 #endif
3634 default:
3635 break;
3638 return r;
3640 #endif
3642 #define IN_ALT (1<<0)
3643 #define IN_NOT (1<<1)
3644 #define IN_REPEAT (1<<2)
3645 #define IN_VAR_REPEAT (1<<3)
3647 /* setup_tree does the following work.
3648 1. check empty loop. (set qn->target_empty_info)
3649 2. expand ignore-case in char class.
3650 3. set memory status bit flags. (reg->mem_stats)
3651 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3652 5. find invalid patterns in look-behind.
3653 6. expand repeated string.
3655 static int
3656 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3658 int type;
3659 int r = 0;
3661 type = NTYPE(node);
3662 switch (type) {
3663 case NT_LIST:
3665 Node* prev = NULL_NODE;
3666 do {
3667 r = setup_tree(NCAR(node), reg, state, env);
3668 if (IS_NOT_NULL(prev) && r == 0) {
3669 r = next_setup(prev, NCAR(node), reg);
3671 prev = NCAR(node);
3672 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3674 break;
3676 case NT_ALT:
3677 do {
3678 r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3679 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3680 break;
3682 case NT_CCLASS:
3683 break;
3685 case NT_STR:
3686 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3687 r = expand_case_fold_string(node, reg);
3689 break;
3691 case NT_CTYPE:
3692 case NT_CANY:
3693 break;
3695 #ifdef USE_SUBEXP_CALL
3696 case NT_CALL:
3697 break;
3698 #endif
3700 case NT_BREF:
3702 int i;
3703 int* p;
3704 Node** nodes = SCANENV_MEM_NODES(env);
3705 BRefNode* br = NBREF(node);
3706 p = BACKREFS_P(br);
3707 for (i = 0; i < br->back_num; i++) {
3708 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3709 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3710 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3711 #ifdef USE_BACKREF_WITH_LEVEL
3712 if (IS_BACKREF_NEST_LEVEL(br)) {
3713 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3715 #endif
3716 SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3719 break;
3721 case NT_QTFR:
3723 OnigDistance d;
3724 QtfrNode* qn = NQTFR(node);
3725 Node* target = qn->target;
3727 if ((state & IN_REPEAT) != 0) {
3728 qn->state |= NST_IN_REPEAT;
3731 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3732 r = get_min_match_length(target, &d, env);
3733 if (r) break;
3734 if (d == 0) {
3735 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3736 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3737 r = quantifiers_memory_node_info(target);
3738 if (r < 0) break;
3739 if (r > 0) {
3740 qn->target_empty_info = r;
3742 #endif
3743 #if 0
3744 r = get_max_match_length(target, &d, env);
3745 if (r == 0 && d == 0) {
3746 /* ()* ==> ()?, ()+ ==> () */
3747 qn->upper = 1;
3748 if (qn->lower > 1) qn->lower = 1;
3749 if (NTYPE(target) == NT_STR) {
3750 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
3753 #endif
3757 state |= IN_REPEAT;
3758 if (qn->lower != qn->upper)
3759 state |= IN_VAR_REPEAT;
3760 r = setup_tree(target, reg, state, env);
3761 if (r) break;
3763 /* expand string */
3764 #define EXPAND_STRING_MAX_LENGTH 100
3765 if (NTYPE(target) == NT_STR) {
3766 if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3767 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3768 int len = NSTRING_LEN(target);
3769 StrNode* sn = NSTR(target);
3771 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3772 int i, n = qn->lower;
3773 onig_node_conv_to_str_node(node, NSTR(target)->flag);
3774 for (i = 0; i < n; i++) {
3775 r = onig_node_str_cat(node, sn->s, sn->end);
3776 if (r) break;
3778 onig_node_free(target);
3779 break; /* break case NT_QTFR: */
3784 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3785 if (qn->greedy && (qn->target_empty_info != 0)) {
3786 if (NTYPE(target) == NT_QTFR) {
3787 QtfrNode* tqn = NQTFR(target);
3788 if (IS_NOT_NULL(tqn->head_exact)) {
3789 qn->head_exact = tqn->head_exact;
3790 tqn->head_exact = NULL;
3793 else {
3794 qn->head_exact = get_head_value_node(qn->target, 1, reg);
3797 #endif
3799 break;
3801 case NT_ENCLOSE:
3803 EncloseNode* en = NENCLOSE(node);
3805 switch (en->type) {
3806 case ENCLOSE_OPTION:
3808 OnigOptionType options = reg->options;
3809 reg->options = NENCLOSE(node)->option;
3810 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
3811 reg->options = options;
3813 break;
3815 case ENCLOSE_MEMORY:
3816 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
3817 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3818 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
3820 r = setup_tree(en->target, reg, state, env);
3821 break;
3823 case ENCLOSE_STOP_BACKTRACK:
3825 Node* target = en->target;
3826 r = setup_tree(target, reg, state, env);
3827 if (NTYPE(target) == NT_QTFR) {
3828 QtfrNode* tqn = NQTFR(target);
3829 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3830 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
3831 int qtype = NTYPE(tqn->target);
3832 if (IS_NODE_TYPE_SIMPLE(qtype))
3833 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3837 break;
3840 break;
3842 case NT_ANCHOR:
3844 AnchorNode* an = NANCHOR(node);
3846 switch (an->type) {
3847 case ANCHOR_PREC_READ:
3848 r = setup_tree(an->target, reg, state, env);
3849 break;
3850 case ANCHOR_PREC_READ_NOT:
3851 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3852 break;
3854 /* allowed node types in look-behind */
3855 #define ALLOWED_TYPE_IN_LB \
3856 ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
3857 BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
3859 #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY )
3860 #define ALLOWED_ENCLOSE_IN_LB_NOT 0
3862 #define ALLOWED_ANCHOR_IN_LB \
3863 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3864 #define ALLOWED_ANCHOR_IN_LB_NOT \
3865 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3867 case ANCHOR_LOOK_BEHIND:
3869 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3870 ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
3871 if (r < 0) return r;
3872 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3873 r = setup_look_behind(node, reg, env);
3874 if (r != 0) return r;
3875 r = setup_tree(an->target, reg, state, env);
3877 break;
3879 case ANCHOR_LOOK_BEHIND_NOT:
3881 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3882 ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3883 if (r < 0) return r;
3884 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3885 r = setup_look_behind(node, reg, env);
3886 if (r != 0) return r;
3887 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3889 break;
3892 break;
3894 default:
3895 break;
3898 return r;
3901 /* set skip map for Boyer-Moor search */
3902 static int
3903 set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
3904 UChar skip[], int** int_skip)
3906 int i, len;
3908 len = end - s;
3909 if (len < ONIG_CHAR_TABLE_SIZE) {
3910 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
3912 for (i = 0; i < len - 1; i++)
3913 skip[s[i]] = len - 1 - i;
3915 else {
3916 if (IS_NULL(*int_skip)) {
3917 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3918 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3920 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3922 for (i = 0; i < len - 1; i++)
3923 (*int_skip)[s[i]] = len - 1 - i;
3925 return 0;
3928 #define OPT_EXACT_MAXLEN 24
3930 typedef struct {
3931 OnigDistance min; /* min byte length */
3932 OnigDistance max; /* max byte length */
3933 } MinMaxLen;
3935 typedef struct {
3936 MinMaxLen mmd;
3937 OnigEncoding enc;
3938 OnigOptionType options;
3939 OnigCaseFoldType case_fold_flag;
3940 ScanEnv* scan_env;
3941 } OptEnv;
3943 typedef struct {
3944 int left_anchor;
3945 int right_anchor;
3946 } OptAncInfo;
3948 typedef struct {
3949 MinMaxLen mmd; /* info position */
3950 OptAncInfo anc;
3952 int reach_end;
3953 int ignore_case;
3954 int len;
3955 UChar s[OPT_EXACT_MAXLEN];
3956 } OptExactInfo;
3958 typedef struct {
3959 MinMaxLen mmd; /* info position */
3960 OptAncInfo anc;
3962 int value; /* weighted value */
3963 UChar map[ONIG_CHAR_TABLE_SIZE];
3964 } OptMapInfo;
3966 typedef struct {
3967 MinMaxLen len;
3969 OptAncInfo anc;
3970 OptExactInfo exb; /* boundary */
3971 OptExactInfo exm; /* middle */
3972 OptExactInfo expr; /* prec read (?=...) */
3974 OptMapInfo map; /* boundary */
3975 } NodeOptInfo;
3978 static int
3979 map_position_value(OnigEncoding enc, int i)
3981 static const short int ByteValTable[] = {
3982 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
3983 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3984 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
3985 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
3986 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3987 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
3988 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3989 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
3992 if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) {
3993 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
3994 return 20;
3995 else
3996 return (int )ByteValTable[i];
3998 else
3999 return 4; /* Take it easy. */
4002 static int
4003 distance_value(MinMaxLen* mm)
4005 /* 1000 / (min-max-dist + 1) */
4006 static const short int dist_vals[] = {
4007 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4008 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4009 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4010 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4011 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4012 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4013 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4014 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4015 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4016 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4019 int d;
4021 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4023 d = mm->max - mm->min;
4024 if (d < (int )(sizeof(dist_vals)/sizeof(dist_vals[0])))
4025 /* return dist_vals[d] * 16 / (mm->min + 12); */
4026 return (int )dist_vals[d];
4027 else
4028 return 1;
4031 static int
4032 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4034 if (v2 <= 0) return -1;
4035 if (v1 <= 0) return 1;
4037 v1 *= distance_value(d1);
4038 v2 *= distance_value(d2);
4040 if (v2 > v1) return 1;
4041 if (v2 < v1) return -1;
4043 if (d2->min < d1->min) return 1;
4044 if (d2->min > d1->min) return -1;
4045 return 0;
4048 static int
4049 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4051 return (a->min == b->min && a->max == b->max) ? 1 : 0;
4055 static void
4056 set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4058 mml->min = min;
4059 mml->max = max;
4062 static void
4063 clear_mml(MinMaxLen* mml)
4065 mml->min = mml->max = 0;
4068 static void
4069 copy_mml(MinMaxLen* to, MinMaxLen* from)
4071 to->min = from->min;
4072 to->max = from->max;
4075 static void
4076 add_mml(MinMaxLen* to, MinMaxLen* from)
4078 to->min = distance_add(to->min, from->min);
4079 to->max = distance_add(to->max, from->max);
4082 #if 0
4083 static void
4084 add_len_mml(MinMaxLen* to, OnigDistance len)
4086 to->min = distance_add(to->min, len);
4087 to->max = distance_add(to->max, len);
4089 #endif
4091 static void
4092 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4094 if (to->min > from->min) to->min = from->min;
4095 if (to->max < from->max) to->max = from->max;
4098 static void
4099 copy_opt_env(OptEnv* to, OptEnv* from)
4101 *to = *from;
4104 static void
4105 clear_opt_anc_info(OptAncInfo* anc)
4107 anc->left_anchor = 0;
4108 anc->right_anchor = 0;
4111 static void
4112 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4114 *to = *from;
4117 static void
4118 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4119 OnigDistance left_len, OnigDistance right_len)
4121 clear_opt_anc_info(to);
4123 to->left_anchor = left->left_anchor;
4124 if (left_len == 0) {
4125 to->left_anchor |= right->left_anchor;
4128 to->right_anchor = right->right_anchor;
4129 if (right_len == 0) {
4130 to->right_anchor |= left->right_anchor;
4134 static int
4135 is_left_anchor(int anc)
4137 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4138 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4139 anc == ANCHOR_PREC_READ_NOT)
4140 return 0;
4142 return 1;
4145 static int
4146 is_set_opt_anc_info(OptAncInfo* to, int anc)
4148 if ((to->left_anchor & anc) != 0) return 1;
4150 return ((to->right_anchor & anc) != 0 ? 1 : 0);
4153 static void
4154 add_opt_anc_info(OptAncInfo* to, int anc)
4156 if (is_left_anchor(anc))
4157 to->left_anchor |= anc;
4158 else
4159 to->right_anchor |= anc;
4162 static void
4163 remove_opt_anc_info(OptAncInfo* to, int anc)
4165 if (is_left_anchor(anc))
4166 to->left_anchor &= ~anc;
4167 else
4168 to->right_anchor &= ~anc;
4171 static void
4172 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4174 to->left_anchor &= add->left_anchor;
4175 to->right_anchor &= add->right_anchor;
4178 static int
4179 is_full_opt_exact_info(OptExactInfo* ex)
4181 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4184 static void
4185 clear_opt_exact_info(OptExactInfo* ex)
4187 clear_mml(&ex->mmd);
4188 clear_opt_anc_info(&ex->anc);
4189 ex->reach_end = 0;
4190 ex->ignore_case = 0;
4191 ex->len = 0;
4192 ex->s[0] = '\0';
4195 static void
4196 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4198 *to = *from;
4201 static void
4202 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4204 int i, j, len;
4205 UChar *p, *end;
4206 OptAncInfo tanc;
4208 if (! to->ignore_case && add->ignore_case) {
4209 if (to->len >= add->len) return ; /* avoid */
4211 to->ignore_case = 1;
4214 p = add->s;
4215 end = p + add->len;
4216 for (i = to->len; p < end; ) {
4217 len = enclen(enc, p, end);
4218 if (i + len > OPT_EXACT_MAXLEN) break;
4219 for (j = 0; j < len && p < end; j++)
4220 to->s[i++] = *p++;
4223 to->len = i;
4224 to->reach_end = (p == end ? add->reach_end : 0);
4226 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4227 if (! to->reach_end) tanc.right_anchor = 0;
4228 copy_opt_anc_info(&to->anc, &tanc);
4231 static void
4232 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4233 int raw ARG_UNUSED, OnigEncoding enc)
4235 int i, j, len;
4236 UChar *p;
4238 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4239 len = enclen(enc, p, end);
4240 if (i + len > OPT_EXACT_MAXLEN) break;
4241 for (j = 0; j < len && p < end; j++)
4242 to->s[i++] = *p++;
4245 to->len = i;
4248 static void
4249 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4251 int i, j, len;
4253 if (add->len == 0 || to->len == 0) {
4254 clear_opt_exact_info(to);
4255 return ;
4258 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4259 clear_opt_exact_info(to);
4260 return ;
4263 for (i = 0; i < to->len && i < add->len; ) {
4264 if (to->s[i] != add->s[i]) break;
4265 len = enclen(env->enc, to->s + i, to->s + to->len);
4267 for (j = 1; j < len; j++) {
4268 if (to->s[i+j] != add->s[i+j]) break;
4270 if (j < len) break;
4271 i += len;
4274 if (! add->reach_end || i < add->len || i < to->len) {
4275 to->reach_end = 0;
4277 to->len = i;
4278 to->ignore_case |= add->ignore_case;
4280 alt_merge_opt_anc_info(&to->anc, &add->anc);
4281 if (! to->reach_end) to->anc.right_anchor = 0;
4284 static void
4285 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4287 int v1, v2;
4289 v1 = now->len;
4290 v2 = alt->len;
4292 if (v2 == 0) {
4293 return ;
4295 else if (v1 == 0) {
4296 copy_opt_exact_info(now, alt);
4297 return ;
4299 else if (v1 <= 2 && v2 <= 2) {
4300 /* ByteValTable[x] is big value --> low price */
4301 v2 = map_position_value(enc, now->s[0]);
4302 v1 = map_position_value(enc, alt->s[0]);
4304 if (now->len > 1) v1 += 5;
4305 if (alt->len > 1) v2 += 5;
4308 if (now->ignore_case == 0) v1 *= 2;
4309 if (alt->ignore_case == 0) v2 *= 2;
4311 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4312 copy_opt_exact_info(now, alt);
4315 static void
4316 clear_opt_map_info(OptMapInfo* map)
4318 static const OptMapInfo clean_info = {
4319 {0, 0}, {0, 0}, 0,
4321 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4322 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4323 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4324 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4325 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4327 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4328 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4329 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4330 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4331 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4332 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4333 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4334 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4335 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4336 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4340 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4343 static void
4344 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4346 *to = *from;
4349 static void
4350 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4352 if (map->map[c] == 0) {
4353 map->map[c] = 1;
4354 map->value += map_position_value(enc, c);
4358 static int
4359 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4360 OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4362 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4363 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4364 int i, n;
4366 add_char_opt_map_info(map, p[0], enc);
4368 case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4369 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4370 if (n < 0) return n;
4372 for (i = 0; i < n; i++) {
4373 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4374 add_char_opt_map_info(map, buf[0], enc);
4377 return 0;
4380 static void
4381 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4383 const int z = 1<<15; /* 32768: something big value */
4385 int v1, v2;
4387 if (alt->value == 0) return ;
4388 if (now->value == 0) {
4389 copy_opt_map_info(now, alt);
4390 return ;
4393 v1 = z / now->value;
4394 v2 = z / alt->value;
4395 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4396 copy_opt_map_info(now, alt);
4399 static int
4400 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4402 #define COMP_EM_BASE 20
4403 int ve, vm;
4405 if (m->value <= 0) return -1;
4407 ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4408 vm = COMP_EM_BASE * 5 * 2 / m->value;
4409 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4412 static void
4413 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4415 int i, val;
4417 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4418 if (to->value == 0) return ;
4419 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4420 clear_opt_map_info(to);
4421 return ;
4424 alt_merge_mml(&to->mmd, &add->mmd);
4426 val = 0;
4427 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4428 if (add->map[i])
4429 to->map[i] = 1;
4431 if (to->map[i])
4432 val += map_position_value(enc, i);
4434 to->value = val;
4436 alt_merge_opt_anc_info(&to->anc, &add->anc);
4439 static void
4440 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4442 copy_mml(&(opt->exb.mmd), mmd);
4443 copy_mml(&(opt->expr.mmd), mmd);
4444 copy_mml(&(opt->map.mmd), mmd);
4447 static void
4448 clear_node_opt_info(NodeOptInfo* opt)
4450 clear_mml(&opt->len);
4451 clear_opt_anc_info(&opt->anc);
4452 clear_opt_exact_info(&opt->exb);
4453 clear_opt_exact_info(&opt->exm);
4454 clear_opt_exact_info(&opt->expr);
4455 clear_opt_map_info(&opt->map);
4458 static void
4459 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4461 *to = *from;
4464 static void
4465 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4467 int exb_reach, exm_reach;
4468 OptAncInfo tanc;
4470 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4471 copy_opt_anc_info(&to->anc, &tanc);
4473 if (add->exb.len > 0 && to->len.max == 0) {
4474 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4475 to->len.max, add->len.max);
4476 copy_opt_anc_info(&add->exb.anc, &tanc);
4479 if (add->map.value > 0 && to->len.max == 0) {
4480 if (add->map.mmd.max == 0)
4481 add->map.anc.left_anchor |= to->anc.left_anchor;
4484 exb_reach = to->exb.reach_end;
4485 exm_reach = to->exm.reach_end;
4487 if (add->len.max != 0)
4488 to->exb.reach_end = to->exm.reach_end = 0;
4490 if (add->exb.len > 0) {
4491 if (exb_reach) {
4492 concat_opt_exact_info(&to->exb, &add->exb, enc);
4493 clear_opt_exact_info(&add->exb);
4495 else if (exm_reach) {
4496 concat_opt_exact_info(&to->exm, &add->exb, enc);
4497 clear_opt_exact_info(&add->exb);
4500 select_opt_exact_info(enc, &to->exm, &add->exb);
4501 select_opt_exact_info(enc, &to->exm, &add->exm);
4503 if (to->expr.len > 0) {
4504 if (add->len.max > 0) {
4505 if (to->expr.len > (int )add->len.max)
4506 to->expr.len = add->len.max;
4508 if (to->expr.mmd.max == 0)
4509 select_opt_exact_info(enc, &to->exb, &to->expr);
4510 else
4511 select_opt_exact_info(enc, &to->exm, &to->expr);
4514 else if (add->expr.len > 0) {
4515 copy_opt_exact_info(&to->expr, &add->expr);
4518 select_opt_map_info(&to->map, &add->map);
4520 add_mml(&to->len, &add->len);
4523 static void
4524 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4526 alt_merge_opt_anc_info (&to->anc, &add->anc);
4527 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4528 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4529 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4530 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4532 alt_merge_mml(&to->len, &add->len);
4536 #define MAX_NODE_OPT_INFO_REF_COUNT 5
4538 static int
4539 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4541 int type;
4542 int r = 0;
4544 clear_node_opt_info(opt);
4545 set_bound_node_opt_info(opt, &env->mmd);
4547 type = NTYPE(node);
4548 switch (type) {
4549 case NT_LIST:
4551 OptEnv nenv;
4552 NodeOptInfo nopt;
4553 Node* nd = node;
4555 copy_opt_env(&nenv, env);
4556 do {
4557 r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4558 if (r == 0) {
4559 add_mml(&nenv.mmd, &nopt.len);
4560 concat_left_node_opt_info(env->enc, opt, &nopt);
4562 } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4564 break;
4566 case NT_ALT:
4568 NodeOptInfo nopt;
4569 Node* nd = node;
4571 do {
4572 r = optimize_node_left(NCAR(nd), &nopt, env);
4573 if (r == 0) {
4574 if (nd == node) copy_node_opt_info(opt, &nopt);
4575 else alt_merge_node_opt_info(opt, &nopt, env);
4577 } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4579 break;
4581 case NT_STR:
4583 StrNode* sn = NSTR(node);
4584 int slen = sn->end - sn->s;
4585 int is_raw = NSTRING_IS_RAW(node);
4587 if (! NSTRING_IS_AMBIG(node)) {
4588 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4589 NSTRING_IS_RAW(node), env->enc);
4590 if (slen > 0) {
4591 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4593 set_mml(&opt->len, slen, slen);
4595 else {
4596 int max;
4598 if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4599 int n = onigenc_strlen(env->enc, sn->s, sn->end);
4600 max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4602 else {
4603 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4604 is_raw, env->enc);
4605 opt->exb.ignore_case = 1;
4607 if (slen > 0) {
4608 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4609 env->enc, env->case_fold_flag);
4610 if (r != 0) break;
4613 max = slen;
4616 set_mml(&opt->len, slen, max);
4619 if (opt->exb.len == slen)
4620 opt->exb.reach_end = 1;
4622 break;
4624 case NT_CCLASS:
4626 int i, z;
4627 CClassNode* cc = NCCLASS(node);
4629 /* no need to check ignore case. (setted in setup_tree()) */
4631 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4632 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4633 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4635 set_mml(&opt->len, min, max);
4637 else {
4638 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4639 z = BITSET_AT(cc->bs, i);
4640 if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
4641 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4644 set_mml(&opt->len, 1, 1);
4647 break;
4649 case NT_CTYPE:
4651 int i, min, max;
4653 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4655 if (max == 1) {
4656 min = 1;
4658 switch (NCTYPE(node)->ctype) {
4659 case ONIGENC_CTYPE_WORD:
4660 if (NCTYPE(node)->not != 0) {
4661 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4662 if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4663 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4667 else {
4668 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4669 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4670 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4674 break;
4677 else {
4678 min = ONIGENC_MBC_MINLEN(env->enc);
4680 set_mml(&opt->len, min, max);
4682 break;
4684 case NT_CANY:
4686 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4687 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4688 set_mml(&opt->len, min, max);
4690 break;
4692 case NT_ANCHOR:
4693 switch (NANCHOR(node)->type) {
4694 case ANCHOR_BEGIN_BUF:
4695 case ANCHOR_BEGIN_POSITION:
4696 case ANCHOR_BEGIN_LINE:
4697 case ANCHOR_END_BUF:
4698 case ANCHOR_SEMI_END_BUF:
4699 case ANCHOR_END_LINE:
4700 add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
4701 break;
4703 case ANCHOR_PREC_READ:
4705 NodeOptInfo nopt;
4707 r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
4708 if (r == 0) {
4709 if (nopt.exb.len > 0)
4710 copy_opt_exact_info(&opt->expr, &nopt.exb);
4711 else if (nopt.exm.len > 0)
4712 copy_opt_exact_info(&opt->expr, &nopt.exm);
4714 opt->expr.reach_end = 0;
4716 if (nopt.map.value > 0)
4717 copy_opt_map_info(&opt->map, &nopt.map);
4720 break;
4722 case ANCHOR_PREC_READ_NOT:
4723 case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */
4724 case ANCHOR_LOOK_BEHIND_NOT:
4725 break;
4727 break;
4729 case NT_BREF:
4731 int i;
4732 int* backs;
4733 OnigDistance min, max, tmin, tmax;
4734 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4735 BRefNode* br = NBREF(node);
4737 if (br->state & NST_RECURSION) {
4738 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4739 break;
4741 backs = BACKREFS_P(br);
4742 r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
4743 if (r != 0) break;
4744 r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
4745 if (r != 0) break;
4746 for (i = 1; i < br->back_num; i++) {
4747 r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
4748 if (r != 0) break;
4749 r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
4750 if (r != 0) break;
4751 if (min > tmin) min = tmin;
4752 if (max < tmax) max = tmax;
4754 if (r == 0) set_mml(&opt->len, min, max);
4756 break;
4758 #ifdef USE_SUBEXP_CALL
4759 case NT_CALL:
4760 if (IS_CALL_RECURSION(NCALL(node)))
4761 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4762 else {
4763 OnigOptionType save = env->options;
4764 env->options = NENCLOSE(NCALL(node)->target)->option;
4765 r = optimize_node_left(NCALL(node)->target, opt, env);
4766 env->options = save;
4768 break;
4769 #endif
4771 case NT_QTFR:
4773 int i;
4774 OnigDistance min, max;
4775 NodeOptInfo nopt;
4776 QtfrNode* qn = NQTFR(node);
4778 r = optimize_node_left(qn->target, &nopt, env);
4779 if (r) break;
4781 if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4782 if (env->mmd.max == 0 &&
4783 NTYPE(qn->target) == NT_CANY && qn->greedy) {
4784 if (IS_MULTILINE(env->options))
4785 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4786 else
4787 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4790 else {
4791 if (qn->lower > 0) {
4792 copy_node_opt_info(opt, &nopt);
4793 if (nopt.exb.len > 0) {
4794 if (nopt.exb.reach_end) {
4795 for (i = 2; i < qn->lower &&
4796 ! is_full_opt_exact_info(&opt->exb); i++) {
4797 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4799 if (i < qn->lower) {
4800 opt->exb.reach_end = 0;
4805 if (qn->lower != qn->upper) {
4806 opt->exb.reach_end = 0;
4807 opt->exm.reach_end = 0;
4809 if (qn->lower > 1)
4810 opt->exm.reach_end = 0;
4814 min = distance_multiply(nopt.len.min, qn->lower);
4815 if (IS_REPEAT_INFINITE(qn->upper))
4816 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4817 else
4818 max = distance_multiply(nopt.len.max, qn->upper);
4820 set_mml(&opt->len, min, max);
4822 break;
4824 case NT_ENCLOSE:
4826 EncloseNode* en = NENCLOSE(node);
4828 switch (en->type) {
4829 case ENCLOSE_OPTION:
4831 OnigOptionType save = env->options;
4833 env->options = en->option;
4834 r = optimize_node_left(en->target, opt, env);
4835 env->options = save;
4837 break;
4839 case ENCLOSE_MEMORY:
4840 #ifdef USE_SUBEXP_CALL
4841 en->opt_count++;
4842 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4843 OnigDistance min, max;
4845 min = 0;
4846 max = ONIG_INFINITE_DISTANCE;
4847 if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
4848 if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
4849 set_mml(&opt->len, min, max);
4851 else
4852 #endif
4854 r = optimize_node_left(en->target, opt, env);
4856 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4857 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4858 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4861 break;
4863 case ENCLOSE_STOP_BACKTRACK:
4864 r = optimize_node_left(en->target, opt, env);
4865 break;
4868 break;
4870 default:
4871 #ifdef ONIG_DEBUG
4872 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4873 NTYPE(node));
4874 #endif
4875 r = ONIGERR_TYPE_BUG;
4876 break;
4879 return r;
4882 static int
4883 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4885 int r;
4887 if (e->len == 0) return 0;
4889 if (e->ignore_case) {
4890 reg->exact = (UChar* )xmalloc(e->len);
4891 CHECK_NULL_RETURN_MEMERR(reg->exact);
4892 xmemcpy(reg->exact, e->s, e->len);
4893 reg->exact_end = reg->exact + e->len;
4894 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4896 else {
4897 int allow_reverse;
4899 reg->exact = str_dup(e->s, e->s + e->len);
4900 CHECK_NULL_RETURN_MEMERR(reg->exact);
4901 reg->exact_end = reg->exact + e->len;
4903 allow_reverse =
4904 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4906 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4907 r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4908 reg->map, &(reg->int_map));
4909 if (r) return r;
4911 reg->optimize = (allow_reverse != 0
4912 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4914 else {
4915 reg->optimize = ONIG_OPTIMIZE_EXACT;
4919 reg->dmin = e->mmd.min;
4920 reg->dmax = e->mmd.max;
4922 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4923 reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
4926 return 0;
4929 static void
4930 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4932 int i;
4934 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4935 reg->map[i] = m->map[i];
4937 reg->optimize = ONIG_OPTIMIZE_MAP;
4938 reg->dmin = m->mmd.min;
4939 reg->dmax = m->mmd.max;
4941 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4942 reg->threshold_len = reg->dmin + 1;
4946 static void
4947 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
4949 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
4950 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
4953 #ifdef ONIG_DEBUG
4954 static void print_optimize_info(FILE* f, regex_t* reg);
4955 #endif
4957 static int
4958 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
4961 int r;
4962 NodeOptInfo opt;
4963 OptEnv env;
4965 env.enc = reg->enc;
4966 env.options = reg->options;
4967 env.case_fold_flag = reg->case_fold_flag;
4968 env.scan_env = scan_env;
4969 clear_mml(&env.mmd);
4971 r = optimize_node_left(node, &opt, &env);
4972 if (r) return r;
4974 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
4975 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML);
4977 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF);
4979 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
4980 reg->anchor_dmin = opt.len.min;
4981 reg->anchor_dmax = opt.len.max;
4984 if (opt.exb.len > 0 || opt.exm.len > 0) {
4985 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
4986 if (opt.map.value > 0 &&
4987 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
4988 goto set_map;
4990 else {
4991 r = set_optimize_exact_info(reg, &opt.exb);
4992 set_sub_anchor(reg, &opt.exb.anc);
4995 else if (opt.map.value > 0) {
4996 set_map:
4997 set_optimize_map_info(reg, &opt.map);
4998 set_sub_anchor(reg, &opt.map.anc);
5000 else {
5001 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5002 if (opt.len.max == 0)
5003 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5006 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5007 print_optimize_info(stderr, reg);
5008 #endif
5009 return r;
5012 static void
5013 clear_optimize_info(regex_t* reg)
5015 reg->optimize = ONIG_OPTIMIZE_NONE;
5016 reg->anchor = 0;
5017 reg->anchor_dmin = 0;
5018 reg->anchor_dmax = 0;
5019 reg->sub_anchor = 0;
5020 reg->exact_end = (UChar* )NULL;
5021 reg->threshold_len = 0;
5022 if (IS_NOT_NULL(reg->exact)) {
5023 xfree(reg->exact);
5024 reg->exact = (UChar* )NULL;
5028 #ifdef ONIG_DEBUG
5030 static void print_enc_string(FILE* fp, OnigEncoding enc,
5031 const UChar *s, const UChar *end)
5033 fprintf(fp, "\nPATTERN: /");
5035 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5036 const UChar *p;
5037 OnigCodePoint code;
5039 p = s;
5040 while (p < end) {
5041 code = ONIGENC_MBC_TO_CODE(enc, p, end);
5042 if (code >= 0x80) {
5043 fprintf(fp, " 0x%04x ", (int )code);
5045 else {
5046 fputc((int )code, fp);
5049 p += enclen(enc, p);
5052 else {
5053 while (s < end) {
5054 fputc((int )*s, fp);
5055 s++;
5059 fprintf(fp, "/\n");
5062 static void
5063 print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5065 if (a == ONIG_INFINITE_DISTANCE)
5066 fputs("inf", f);
5067 else
5068 fprintf(f, "(%u)", a);
5070 fputs("-", f);
5072 if (b == ONIG_INFINITE_DISTANCE)
5073 fputs("inf", f);
5074 else
5075 fprintf(f, "(%u)", b);
5078 static void
5079 print_anchor(FILE* f, int anchor)
5081 int q = 0;
5083 fprintf(f, "[");
5085 if (anchor & ANCHOR_BEGIN_BUF) {
5086 fprintf(f, "begin-buf");
5087 q = 1;
5089 if (anchor & ANCHOR_BEGIN_LINE) {
5090 if (q) fprintf(f, ", ");
5091 q = 1;
5092 fprintf(f, "begin-line");
5094 if (anchor & ANCHOR_BEGIN_POSITION) {
5095 if (q) fprintf(f, ", ");
5096 q = 1;
5097 fprintf(f, "begin-pos");
5099 if (anchor & ANCHOR_END_BUF) {
5100 if (q) fprintf(f, ", ");
5101 q = 1;
5102 fprintf(f, "end-buf");
5104 if (anchor & ANCHOR_SEMI_END_BUF) {
5105 if (q) fprintf(f, ", ");
5106 q = 1;
5107 fprintf(f, "semi-end-buf");
5109 if (anchor & ANCHOR_END_LINE) {
5110 if (q) fprintf(f, ", ");
5111 q = 1;
5112 fprintf(f, "end-line");
5114 if (anchor & ANCHOR_ANYCHAR_STAR) {
5115 if (q) fprintf(f, ", ");
5116 q = 1;
5117 fprintf(f, "anychar-star");
5119 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5120 if (q) fprintf(f, ", ");
5121 fprintf(f, "anychar-star-pl");
5124 fprintf(f, "]");
5127 static void
5128 print_optimize_info(FILE* f, regex_t* reg)
5130 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5131 "EXACT_IC", "MAP" };
5133 fprintf(f, "optimize: %s\n", on[reg->optimize]);
5134 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
5135 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5136 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5137 fprintf(f, "\n");
5139 if (reg->optimize) {
5140 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
5141 fprintf(f, "\n");
5143 fprintf(f, "\n");
5145 if (reg->exact) {
5146 UChar *p;
5147 fprintf(f, "exact: [");
5148 for (p = reg->exact; p < reg->exact_end; p++) {
5149 fputc(*p, f);
5151 fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
5153 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5154 int c, i, n = 0;
5156 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5157 if (reg->map[i]) n++;
5159 fprintf(f, "map: n=%d\n", n);
5160 if (n > 0) {
5161 c = 0;
5162 fputc('[', f);
5163 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5164 if (reg->map[i] != 0) {
5165 if (c > 0) fputs(", ", f);
5166 c++;
5167 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5168 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5169 fputc(i, f);
5170 else
5171 fprintf(f, "%d", i);
5174 fprintf(f, "]\n");
5178 #endif /* ONIG_DEBUG */
5181 static void
5182 onig_free_body(regex_t* reg)
5184 if (IS_NOT_NULL(reg->p)) xfree(reg->p);
5185 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
5186 if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
5187 if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
5188 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
5189 if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
5191 #ifdef USE_NAMED_GROUP
5192 onig_names_free(reg);
5193 #endif
5196 extern void
5197 onig_free(regex_t* reg)
5199 if (IS_NOT_NULL(reg)) {
5200 onig_free_body(reg);
5201 xfree(reg);
5205 #define REGEX_TRANSFER(to,from) do {\
5206 (to)->state = ONIG_STATE_MODIFY;\
5207 onig_free_body(to);\
5208 xmemcpy(to, from, sizeof(regex_t));\
5209 xfree(from);\
5210 } while (0)
5212 extern void
5213 onig_transfer(regex_t* to, regex_t* from)
5215 THREAD_ATOMIC_START;
5216 REGEX_TRANSFER(to, from);
5217 THREAD_ATOMIC_END;
5220 #define REGEX_CHAIN_HEAD(reg) do {\
5221 while (IS_NOT_NULL((reg)->chain)) {\
5222 (reg) = (reg)->chain;\
5224 } while (0)
5226 extern void
5227 onig_chain_link_add(regex_t* to, regex_t* add)
5229 THREAD_ATOMIC_START;
5230 REGEX_CHAIN_HEAD(to);
5231 to->chain = add;
5232 THREAD_ATOMIC_END;
5235 extern void
5236 onig_chain_reduce(regex_t* reg)
5238 regex_t *head, *prev;
5240 prev = reg;
5241 head = prev->chain;
5242 if (IS_NOT_NULL(head)) {
5243 reg->state = ONIG_STATE_MODIFY;
5244 while (IS_NOT_NULL(head->chain)) {
5245 prev = head;
5246 head = head->chain;
5248 prev->chain = (regex_t* )NULL;
5249 REGEX_TRANSFER(reg, head);
5253 #if 0
5254 extern int
5255 onig_clone(regex_t** to, regex_t* from)
5257 int r, size;
5258 regex_t* reg;
5260 #ifdef USE_MULTI_THREAD_SYSTEM
5261 if (ONIG_STATE(from) >= ONIG_STATE_NORMAL) {
5262 ONIG_STATE_INC(from);
5263 if (IS_NOT_NULL(from->chain) && ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5264 onig_chain_reduce(from);
5265 ONIG_STATE_INC(from);
5268 else {
5269 int n = 0;
5270 while (ONIG_STATE(from) < ONIG_STATE_NORMAL) {
5271 if (++n > THREAD_PASS_LIMIT_COUNT)
5272 return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT;
5273 THREAD_PASS;
5275 ONIG_STATE_INC(from);
5277 #endif /* USE_MULTI_THREAD_SYSTEM */
5279 r = onig_alloc_init(&reg, ONIG_OPTION_NONE, ONIGENC_CASE_FOLD_DEFAULT,
5280 from->enc, ONIG_SYNTAX_DEFAULT);
5281 if (r != 0) {
5282 ONIG_STATE_DEC(from);
5283 return r;
5286 xmemcpy(reg, from, sizeof(onig_t));
5287 reg->chain = (regex_t* )NULL;
5288 reg->state = ONIG_STATE_NORMAL;
5290 if (from->p) {
5291 reg->p = (UChar* )xmalloc(reg->alloc);
5292 if (IS_NULL(reg->p)) goto mem_error;
5293 xmemcpy(reg->p, from->p, reg->alloc);
5296 if (from->exact) {
5297 reg->exact = (UChar* )xmalloc(from->exact_end - from->exact);
5298 if (IS_NULL(reg->exact)) goto mem_error;
5299 reg->exact_end = reg->exact + (from->exact_end - from->exact);
5300 xmemcpy(reg->exact, from->exact, reg->exact_end - reg->exact);
5303 if (from->int_map) {
5304 size = sizeof(int) * ONIG_CHAR_TABLE_SIZE;
5305 reg->int_map = (int* )xmalloc(size);
5306 if (IS_NULL(reg->int_map)) goto mem_error;
5307 xmemcpy(reg->int_map, from->int_map, size);
5310 if (from->int_map_backward) {
5311 size = sizeof(int) * ONIG_CHAR_TABLE_SIZE;
5312 reg->int_map_backward = (int* )xmalloc(size);
5313 if (IS_NULL(reg->int_map_backward)) goto mem_error;
5314 xmemcpy(reg->int_map_backward, from->int_map_backward, size);
5317 #ifdef USE_NAMED_GROUP
5318 reg->name_table = names_clone(from); /* names_clone is not implemented */
5319 #endif
5321 ONIG_STATE_DEC(from);
5322 *to = reg;
5323 return 0;
5325 mem_error:
5326 ONIG_STATE_DEC(from);
5327 return ONIGERR_MEMORY;
5329 #endif
5331 #ifdef ONIG_DEBUG
5332 static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5333 #endif
5334 #ifdef ONIG_DEBUG_PARSE_TREE
5335 static void print_tree P_((FILE* f, Node* node));
5336 #endif
5338 extern int
5339 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5340 OnigErrorInfo* einfo)
5342 #define COMPILE_INIT_SIZE 20
5344 int r, init_size;
5345 Node* root;
5346 ScanEnv scan_env;
5347 #ifdef USE_SUBEXP_CALL
5348 UnsetAddrList uslist;
5349 #endif
5351 reg->state = ONIG_STATE_COMPILING;
5353 #ifdef ONIG_DEBUG
5354 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5355 #endif
5357 if (reg->alloc == 0) {
5358 init_size = (pattern_end - pattern) * 2;
5359 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5360 r = BBUF_INIT(reg, init_size);
5361 if (r != 0) goto end;
5363 else
5364 reg->used = 0;
5366 reg->num_mem = 0;
5367 reg->num_repeat = 0;
5368 reg->num_null_check = 0;
5369 reg->repeat_range_alloc = 0;
5370 reg->repeat_range = (OnigRepeatRange* )NULL;
5371 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5372 reg->num_comb_exp_check = 0;
5373 #endif
5375 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5376 if (r != 0) goto err;
5378 #ifdef USE_NAMED_GROUP
5379 /* mixed use named group and no-named group */
5380 if (scan_env.num_named > 0 &&
5381 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5382 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5383 if (scan_env.num_named != scan_env.num_mem)
5384 r = disable_noname_group_capture(&root, reg, &scan_env);
5385 else
5386 r = numbered_ref_check(root);
5388 if (r != 0) goto err;
5390 #endif
5392 #ifdef USE_SUBEXP_CALL
5393 if (scan_env.num_call > 0) {
5394 r = unset_addr_list_init(&uslist, scan_env.num_call);
5395 if (r != 0) goto err;
5396 scan_env.unset_addr_list = &uslist;
5397 r = setup_subexp_call(root, &scan_env);
5398 if (r != 0) goto err_unset;
5399 r = subexp_recursive_check_trav(root, &scan_env);
5400 if (r < 0) goto err_unset;
5401 r = subexp_inf_recursive_check_trav(root, &scan_env);
5402 if (r != 0) goto err_unset;
5404 reg->num_call = scan_env.num_call;
5406 else
5407 reg->num_call = 0;
5408 #endif
5410 r = setup_tree(root, reg, 0, &scan_env);
5411 if (r != 0) goto err_unset;
5413 #ifdef ONIG_DEBUG_PARSE_TREE
5414 print_tree(stderr, root);
5415 #endif
5417 reg->capture_history = scan_env.capture_history;
5418 reg->bt_mem_start = scan_env.bt_mem_start;
5419 reg->bt_mem_start |= reg->capture_history;
5420 if (IS_FIND_CONDITION(reg->options))
5421 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5422 else {
5423 reg->bt_mem_end = scan_env.bt_mem_end;
5424 reg->bt_mem_end |= reg->capture_history;
5427 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5428 if (scan_env.backrefed_mem == 0
5429 #ifdef USE_SUBEXP_CALL
5430 || scan_env.num_call == 0
5431 #endif
5433 setup_comb_exp_check(root, 0, &scan_env);
5434 #ifdef USE_SUBEXP_CALL
5435 if (scan_env.has_recursion != 0) {
5436 scan_env.num_comb_exp_check = 0;
5438 else
5439 #endif
5440 if (scan_env.comb_exp_max_regnum > 0) {
5441 int i;
5442 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5443 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5444 scan_env.num_comb_exp_check = 0;
5445 break;
5451 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5452 #endif
5454 clear_optimize_info(reg);
5455 #ifndef ONIG_DONT_OPTIMIZE
5456 r = set_optimize_info_from_tree(root, reg, &scan_env);
5457 if (r != 0) goto err_unset;
5458 #endif
5460 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5461 xfree(scan_env.mem_nodes_dynamic);
5462 scan_env.mem_nodes_dynamic = (Node** )NULL;
5465 r = compile_tree(root, reg);
5466 if (r == 0) {
5467 r = add_opcode(reg, OP_END);
5468 #ifdef USE_SUBEXP_CALL
5469 if (scan_env.num_call > 0) {
5470 r = unset_addr_list_fix(&uslist, reg);
5471 unset_addr_list_end(&uslist);
5472 if (r) goto err;
5474 #endif
5476 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5477 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5478 else {
5479 if (reg->bt_mem_start != 0)
5480 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5481 else
5482 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5485 #ifdef USE_SUBEXP_CALL
5486 else if (scan_env.num_call > 0) {
5487 unset_addr_list_end(&uslist);
5489 #endif
5490 onig_node_free(root);
5492 #ifdef ONIG_DEBUG_COMPILE
5493 #ifdef USE_NAMED_GROUP
5494 onig_print_names(stderr, reg);
5495 #endif
5496 print_compiled_byte_code_list(stderr, reg);
5497 #endif
5499 end:
5500 reg->state = ONIG_STATE_NORMAL;
5501 return r;
5503 err_unset:
5504 #ifdef USE_SUBEXP_CALL
5505 if (scan_env.num_call > 0) {
5506 unset_addr_list_end(&uslist);
5508 #endif
5509 err:
5510 if (IS_NOT_NULL(scan_env.error)) {
5511 if (IS_NOT_NULL(einfo)) {
5512 einfo->enc = scan_env.enc;
5513 einfo->par = scan_env.error;
5514 einfo->par_end = scan_env.error_end;
5518 onig_node_free(root);
5519 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5520 xfree(scan_env.mem_nodes_dynamic);
5521 return r;
5524 #ifdef USE_RECOMPILE_API
5525 extern int
5526 onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5527 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5528 OnigErrorInfo* einfo)
5530 int r;
5531 regex_t *new_reg;
5533 r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo);
5534 if (r) return r;
5535 if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5536 onig_transfer(reg, new_reg);
5538 else {
5539 onig_chain_link_add(reg, new_reg);
5541 return 0;
5543 #endif
5545 static int onig_inited = 0;
5547 extern int
5548 onig_alloc_init(regex_t** reg, OnigOptionType option,
5549 OnigCaseFoldType case_fold_flag,
5550 OnigEncoding enc, const OnigSyntaxType* syntax)
5552 if (! onig_inited)
5553 onig_init();
5555 if (ONIGENC_IS_UNDEF(enc))
5556 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5558 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5559 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5560 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5563 *reg = (regex_t* )xmalloc(sizeof(regex_t));
5564 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5565 (*reg)->state = ONIG_STATE_MODIFY;
5567 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5568 option |= syntax->options;
5569 option &= ~ONIG_OPTION_SINGLELINE;
5571 else
5572 option |= syntax->options;
5574 (*reg)->enc = enc;
5575 (*reg)->options = option;
5576 (*reg)->syntax = syntax;
5577 (*reg)->optimize = 0;
5578 (*reg)->exact = (UChar* )NULL;
5579 (*reg)->int_map = (int* )NULL;
5580 (*reg)->int_map_backward = (int* )NULL;
5581 (*reg)->chain = (regex_t* )NULL;
5583 (*reg)->p = (UChar* )NULL;
5584 (*reg)->alloc = 0;
5585 (*reg)->used = 0;
5586 (*reg)->name_table = (void* )NULL;
5588 (*reg)->case_fold_flag = case_fold_flag;
5589 return 0;
5592 extern int
5593 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5594 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
5595 OnigErrorInfo* einfo)
5597 int r;
5599 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5601 r = onig_alloc_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT,
5602 enc, syntax);
5603 if (r) return r;
5605 r = onig_compile(*reg, pattern, pattern_end, einfo);
5606 if (r) {
5607 onig_free(*reg);
5608 *reg = NULL;
5610 return r;
5613 extern int
5614 onig_init(void)
5616 if (onig_inited != 0)
5617 return 0;
5619 THREAD_SYSTEM_INIT;
5620 THREAD_ATOMIC_START;
5622 onig_inited = 1;
5624 onigenc_init();
5625 /* onigenc_set_default_caseconv_table((UChar* )0); */
5627 #ifdef ONIG_DEBUG_STATISTICS
5628 onig_statistics_init();
5629 #endif
5631 THREAD_ATOMIC_END;
5632 return 0;
5636 extern int
5637 onig_end(void)
5639 THREAD_ATOMIC_START;
5641 #ifdef ONIG_DEBUG_STATISTICS
5642 onig_print_statistics(stderr);
5643 #endif
5645 #ifdef USE_SHARED_CCLASS_TABLE
5646 onig_free_shared_cclass_table();
5647 #endif
5649 #ifdef USE_PARSE_TREE_NODE_RECYCLE
5650 onig_free_node_list();
5651 #endif
5653 onig_inited = 0;
5655 THREAD_ATOMIC_END;
5656 THREAD_SYSTEM_END;
5657 return 0;
5660 extern int
5661 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
5663 OnigCodePoint n, *data;
5664 OnigCodePoint low, high, x;
5666 GET_CODE_POINT(n, p);
5667 data = (OnigCodePoint* )p;
5668 data++;
5670 for (low = 0, high = n; low < high; ) {
5671 x = (low + high) >> 1;
5672 if (code > data[x * 2 + 1])
5673 low = x + 1;
5674 else
5675 high = x;
5678 return ((low < n && code >= data[low * 2]) ? 1 : 0);
5681 extern int
5682 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
5684 int found;
5686 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
5687 if (IS_NULL(cc->mbuf)) {
5688 found = 0;
5690 else {
5691 found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
5694 else {
5695 found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
5698 if (IS_NCCLASS_NOT(cc))
5699 return !found;
5700 else
5701 return found;
5704 extern int
5705 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
5707 int len;
5709 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5710 len = 2;
5712 else {
5713 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
5715 return onig_is_code_in_cc_len(len, code, cc);
5719 #ifdef ONIG_DEBUG
5721 /* arguments type */
5722 #define ARG_SPECIAL -1
5723 #define ARG_NON 0
5724 #define ARG_RELADDR 1
5725 #define ARG_ABSADDR 2
5726 #define ARG_LENGTH 3
5727 #define ARG_MEMNUM 4
5728 #define ARG_OPTION 5
5729 #define ARG_STATE_CHECK 6
5731 OnigOpInfoType OnigOpInfo[] = {
5732 { OP_FINISH, "finish", ARG_NON },
5733 { OP_END, "end", ARG_NON },
5734 { OP_EXACT1, "exact1", ARG_SPECIAL },
5735 { OP_EXACT2, "exact2", ARG_SPECIAL },
5736 { OP_EXACT3, "exact3", ARG_SPECIAL },
5737 { OP_EXACT4, "exact4", ARG_SPECIAL },
5738 { OP_EXACT5, "exact5", ARG_SPECIAL },
5739 { OP_EXACTN, "exactn", ARG_SPECIAL },
5740 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
5741 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
5742 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
5743 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
5744 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
5745 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
5746 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
5747 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
5748 { OP_CCLASS, "cclass", ARG_SPECIAL },
5749 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
5750 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
5751 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
5752 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
5753 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
5754 { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL },
5755 { OP_ANYCHAR, "anychar", ARG_NON },
5756 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
5757 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
5758 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
5759 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5760 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5761 { OP_WORD, "word", ARG_NON },
5762 { OP_NOT_WORD, "not-word", ARG_NON },
5763 { OP_WORD_BOUND, "word-bound", ARG_NON },
5764 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
5765 { OP_WORD_BEGIN, "word-begin", ARG_NON },
5766 { OP_WORD_END, "word-end", ARG_NON },
5767 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
5768 { OP_END_BUF, "end-buf", ARG_NON },
5769 { OP_BEGIN_LINE, "begin-line", ARG_NON },
5770 { OP_END_LINE, "end-line", ARG_NON },
5771 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
5772 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
5773 { OP_BACKREF1, "backref1", ARG_NON },
5774 { OP_BACKREF2, "backref2", ARG_NON },
5775 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
5776 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
5777 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
5778 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
5779 { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
5780 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
5781 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
5782 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
5783 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
5784 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
5785 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
5786 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
5787 { OP_SET_OPTION, "set-option", ARG_OPTION },
5788 { OP_FAIL, "fail", ARG_NON },
5789 { OP_JUMP, "jump", ARG_RELADDR },
5790 { OP_PUSH, "push", ARG_RELADDR },
5791 { OP_POP, "pop", ARG_NON },
5792 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
5793 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
5794 { OP_REPEAT, "repeat", ARG_SPECIAL },
5795 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
5796 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
5797 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
5798 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
5799 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
5800 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
5801 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
5802 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
5803 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
5804 { OP_PUSH_POS, "push-pos", ARG_NON },
5805 { OP_POP_POS, "pop-pos", ARG_NON },
5806 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
5807 { OP_FAIL_POS, "fail-pos", ARG_NON },
5808 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
5809 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
5810 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
5811 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5812 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5813 { OP_CALL, "call", ARG_ABSADDR },
5814 { OP_RETURN, "return", ARG_NON },
5815 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
5816 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5817 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
5818 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
5819 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5820 "state-check-anychar-ml*", ARG_STATE_CHECK },
5821 { -1, "", ARG_NON }
5824 static char*
5825 op2name(int opcode)
5827 int i;
5829 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5830 if (opcode == OnigOpInfo[i].opcode)
5831 return OnigOpInfo[i].name;
5833 return "";
5836 static int
5837 op2arg_type(int opcode)
5839 int i;
5841 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5842 if (opcode == OnigOpInfo[i].opcode)
5843 return OnigOpInfo[i].arg_type;
5845 return ARG_SPECIAL;
5848 static void
5849 Indent(FILE* f, int indent)
5851 int i;
5852 for (i = 0; i < indent; i++) putc(' ', f);
5855 static void
5856 p_string(FILE* f, int len, UChar* s)
5858 fputs(":", f);
5859 while (len-- > 0) { fputc(*s++, f); }
5862 static void
5863 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5865 int x = len * mb_len;
5867 fprintf(f, ":%d:", len);
5868 while (x-- > 0) { fputc(*s++, f); }
5871 extern void
5872 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5873 OnigEncoding enc)
5875 int i, n, arg_type;
5876 RelAddrType addr;
5877 LengthType len;
5878 MemNumType mem;
5879 StateCheckNumType scn;
5880 OnigCodePoint code;
5881 UChar *q;
5883 fprintf(f, "[%s", op2name(*bp));
5884 arg_type = op2arg_type(*bp);
5885 if (arg_type != ARG_SPECIAL) {
5886 bp++;
5887 switch (arg_type) {
5888 case ARG_NON:
5889 break;
5890 case ARG_RELADDR:
5891 GET_RELADDR_INC(addr, bp);
5892 fprintf(f, ":(%d)", addr);
5893 break;
5894 case ARG_ABSADDR:
5895 GET_ABSADDR_INC(addr, bp);
5896 fprintf(f, ":(%d)", addr);
5897 break;
5898 case ARG_LENGTH:
5899 GET_LENGTH_INC(len, bp);
5900 fprintf(f, ":%d", len);
5901 break;
5902 case ARG_MEMNUM:
5903 mem = *((MemNumType* )bp);
5904 bp += SIZE_MEMNUM;
5905 fprintf(f, ":%d", mem);
5906 break;
5907 case ARG_OPTION:
5909 OnigOptionType option = *((OnigOptionType* )bp);
5910 bp += SIZE_OPTION;
5911 fprintf(f, ":%d", option);
5913 break;
5915 case ARG_STATE_CHECK:
5916 scn = *((StateCheckNumType* )bp);
5917 bp += SIZE_STATE_CHECK_NUM;
5918 fprintf(f, ":%d", scn);
5919 break;
5922 else {
5923 switch (*bp++) {
5924 case OP_EXACT1:
5925 case OP_ANYCHAR_STAR_PEEK_NEXT:
5926 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5927 p_string(f, 1, bp++); break;
5928 case OP_EXACT2:
5929 p_string(f, 2, bp); bp += 2; break;
5930 case OP_EXACT3:
5931 p_string(f, 3, bp); bp += 3; break;
5932 case OP_EXACT4:
5933 p_string(f, 4, bp); bp += 4; break;
5934 case OP_EXACT5:
5935 p_string(f, 5, bp); bp += 5; break;
5936 case OP_EXACTN:
5937 GET_LENGTH_INC(len, bp);
5938 p_len_string(f, len, 1, bp);
5939 bp += len;
5940 break;
5942 case OP_EXACTMB2N1:
5943 p_string(f, 2, bp); bp += 2; break;
5944 case OP_EXACTMB2N2:
5945 p_string(f, 4, bp); bp += 4; break;
5946 case OP_EXACTMB2N3:
5947 p_string(f, 6, bp); bp += 6; break;
5948 case OP_EXACTMB2N:
5949 GET_LENGTH_INC(len, bp);
5950 p_len_string(f, len, 2, bp);
5951 bp += len * 2;
5952 break;
5953 case OP_EXACTMB3N:
5954 GET_LENGTH_INC(len, bp);
5955 p_len_string(f, len, 3, bp);
5956 bp += len * 3;
5957 break;
5958 case OP_EXACTMBN:
5960 int mb_len;
5962 GET_LENGTH_INC(mb_len, bp);
5963 GET_LENGTH_INC(len, bp);
5964 fprintf(f, ":%d:%d:", mb_len, len);
5965 n = len * mb_len;
5966 while (n-- > 0) { fputc(*bp++, f); }
5968 break;
5970 case OP_EXACT1_IC:
5971 len = enclen(enc, bp);
5972 p_string(f, len, bp);
5973 bp += len;
5974 break;
5975 case OP_EXACTN_IC:
5976 GET_LENGTH_INC(len, bp);
5977 p_len_string(f, len, 1, bp);
5978 bp += len;
5979 break;
5981 case OP_CCLASS:
5982 n = bitset_on_num((BitSetRef )bp);
5983 bp += SIZE_BITSET;
5984 fprintf(f, ":%d", n);
5985 break;
5987 case OP_CCLASS_NOT:
5988 n = bitset_on_num((BitSetRef )bp);
5989 bp += SIZE_BITSET;
5990 fprintf(f, ":%d", n);
5991 break;
5993 case OP_CCLASS_MB:
5994 case OP_CCLASS_MB_NOT:
5995 GET_LENGTH_INC(len, bp);
5996 q = bp;
5997 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5998 ALIGNMENT_RIGHT(q);
5999 #endif
6000 GET_CODE_POINT(code, q);
6001 bp += len;
6002 fprintf(f, ":%d:%d", (int )code, len);
6003 break;
6005 case OP_CCLASS_MIX:
6006 case OP_CCLASS_MIX_NOT:
6007 n = bitset_on_num((BitSetRef )bp);
6008 bp += SIZE_BITSET;
6009 GET_LENGTH_INC(len, bp);
6010 q = bp;
6011 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
6012 ALIGNMENT_RIGHT(q);
6013 #endif
6014 GET_CODE_POINT(code, q);
6015 bp += len;
6016 fprintf(f, ":%d:%d:%d", n, (int )code, len);
6017 break;
6019 case OP_CCLASS_NODE:
6021 CClassNode *cc;
6023 GET_POINTER_INC(cc, bp);
6024 n = bitset_on_num(cc->bs);
6025 fprintf(f, ":%u:%d", (unsigned int )cc, n);
6027 break;
6029 case OP_BACKREFN_IC:
6030 mem = *((MemNumType* )bp);
6031 bp += SIZE_MEMNUM;
6032 fprintf(f, ":%d", mem);
6033 break;
6035 case OP_BACKREF_MULTI_IC:
6036 case OP_BACKREF_MULTI:
6037 fputs(" ", f);
6038 GET_LENGTH_INC(len, bp);
6039 for (i = 0; i < len; i++) {
6040 GET_MEMNUM_INC(mem, bp);
6041 if (i > 0) fputs(", ", f);
6042 fprintf(f, "%d", mem);
6044 break;
6046 case OP_BACKREF_WITH_LEVEL:
6048 OnigOptionType option;
6049 LengthType level;
6051 GET_OPTION_INC(option, bp);
6052 fprintf(f, ":%d", option);
6053 GET_LENGTH_INC(level, bp);
6054 fprintf(f, ":%d", level);
6056 fputs(" ", f);
6057 GET_LENGTH_INC(len, bp);
6058 for (i = 0; i < len; i++) {
6059 GET_MEMNUM_INC(mem, bp);
6060 if (i > 0) fputs(", ", f);
6061 fprintf(f, "%d", mem);
6064 break;
6066 case OP_REPEAT:
6067 case OP_REPEAT_NG:
6069 mem = *((MemNumType* )bp);
6070 bp += SIZE_MEMNUM;
6071 addr = *((RelAddrType* )bp);
6072 bp += SIZE_RELADDR;
6073 fprintf(f, ":%d:%d", mem, addr);
6075 break;
6077 case OP_PUSH_OR_JUMP_EXACT1:
6078 case OP_PUSH_IF_PEEK_NEXT:
6079 addr = *((RelAddrType* )bp);
6080 bp += SIZE_RELADDR;
6081 fprintf(f, ":(%d)", addr);
6082 p_string(f, 1, bp);
6083 bp += 1;
6084 break;
6086 case OP_LOOK_BEHIND:
6087 GET_LENGTH_INC(len, bp);
6088 fprintf(f, ":%d", len);
6089 break;
6091 case OP_PUSH_LOOK_BEHIND_NOT:
6092 GET_RELADDR_INC(addr, bp);
6093 GET_LENGTH_INC(len, bp);
6094 fprintf(f, ":%d:(%d)", len, addr);
6095 break;
6097 case OP_STATE_CHECK_PUSH:
6098 case OP_STATE_CHECK_PUSH_OR_JUMP:
6099 scn = *((StateCheckNumType* )bp);
6100 bp += SIZE_STATE_CHECK_NUM;
6101 addr = *((RelAddrType* )bp);
6102 bp += SIZE_RELADDR;
6103 fprintf(f, ":%d:(%d)", scn, addr);
6104 break;
6106 default:
6107 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6108 *--bp);
6111 fputs("]", f);
6112 if (nextp) *nextp = bp;
6115 static void
6116 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6118 int ncode;
6119 UChar* bp = reg->p;
6120 UChar* end = reg->p + reg->used;
6122 fprintf(f, "code length: %d\n", reg->used);
6124 ncode = 0;
6125 while (bp < end) {
6126 ncode++;
6127 if (bp > reg->p) {
6128 if (ncode % 5 == 0)
6129 fprintf(f, "\n");
6130 else
6131 fputs(" ", f);
6133 onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
6136 fprintf(f, "\n");
6139 static void
6140 print_indent_tree(FILE* f, Node* node, int indent)
6142 int i, type;
6143 int add = 3;
6144 UChar* p;
6146 Indent(f, indent);
6147 if (IS_NULL(node)) {
6148 fprintf(f, "ERROR: null node!!!\n");
6149 exit (0);
6152 type = NTYPE(node);
6153 switch (type) {
6154 case NT_LIST:
6155 case NT_ALT:
6156 if (NTYPE(node) == NT_LIST)
6157 fprintf(f, "<list:%x>\n", (int )node);
6158 else
6159 fprintf(f, "<alt:%x>\n", (int )node);
6161 print_indent_tree(f, NCAR(node), indent + add);
6162 while (IS_NOT_NULL(node = NCDR(node))) {
6163 if (NTYPE(node) != type) {
6164 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6165 exit(0);
6167 print_indent_tree(f, NCAR(node), indent + add);
6169 break;
6171 case NT_STR:
6172 fprintf(f, "<string%s:%x>",
6173 (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
6174 for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6175 if (*p >= 0x20 && *p < 0x7f)
6176 fputc(*p, f);
6177 else {
6178 fprintf(f, " 0x%02x", *p);
6181 break;
6183 case NT_CCLASS:
6184 fprintf(f, "<cclass:%x>", (int )node);
6185 if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
6186 if (NCCLASS(node)->mbuf) {
6187 BBuf* bbuf = NCCLASS(node)->mbuf;
6188 for (i = 0; i < bbuf->used; i++) {
6189 if (i > 0) fprintf(f, ",");
6190 fprintf(f, "%0x", bbuf->p[i]);
6193 break;
6195 case NT_CTYPE:
6196 fprintf(f, "<ctype:%x> ", (int )node);
6197 switch (NCTYPE(node)->ctype) {
6198 case ONIGENC_CTYPE_WORD:
6199 if (NCTYPE(node)->not != 0)
6200 fputs("not word", f);
6201 else
6202 fputs("word", f);
6203 break;
6205 default:
6206 fprintf(f, "ERROR: undefined ctype.\n");
6207 exit(0);
6209 break;
6211 case NT_CANY:
6212 fprintf(f, "<anychar:%x>", (int )node);
6213 break;
6215 case NT_ANCHOR:
6216 fprintf(f, "<anchor:%x> ", (int )node);
6217 switch (NANCHOR(node)->type) {
6218 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
6219 case ANCHOR_END_BUF: fputs("end buf", f); break;
6220 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
6221 case ANCHOR_END_LINE: fputs("end line", f); break;
6222 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
6223 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6225 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
6226 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
6227 #ifdef USE_WORD_BEGIN_END
6228 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
6229 case ANCHOR_WORD_END: fputs("word end", f); break;
6230 #endif
6231 case ANCHOR_PREC_READ: fputs("prec read", f); break;
6232 case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break;
6233 case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break;
6234 case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
6236 default:
6237 fprintf(f, "ERROR: undefined anchor type.\n");
6238 break;
6240 break;
6242 case NT_BREF:
6244 int* p;
6245 BRefNode* br = NBREF(node);
6246 p = BACKREFS_P(br);
6247 fprintf(f, "<backref:%x>", (int )node);
6248 for (i = 0; i < br->back_num; i++) {
6249 if (i > 0) fputs(", ", f);
6250 fprintf(f, "%d", p[i]);
6253 break;
6255 #ifdef USE_SUBEXP_CALL
6256 case NT_CALL:
6258 CallNode* cn = NCALL(node);
6259 fprintf(f, "<call:%x>", (int )node);
6260 p_string(f, cn->name_end - cn->name, cn->name);
6262 break;
6263 #endif
6265 case NT_QTFR:
6266 fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
6267 NQTFR(node)->lower, NQTFR(node)->upper,
6268 (NQTFR(node)->greedy ? "" : "?"));
6269 print_indent_tree(f, NQTFR(node)->target, indent + add);
6270 break;
6272 case NT_ENCLOSE:
6273 fprintf(f, "<enclose:%x> ", (int )node);
6274 switch (NENCLOSE(node)->type) {
6275 case ENCLOSE_OPTION:
6276 fprintf(f, "option:%d\n", NENCLOSE(node)->option);
6277 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6278 break;
6279 case ENCLOSE_MEMORY:
6280 fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6281 break;
6282 case ENCLOSE_STOP_BACKTRACK:
6283 fprintf(f, "stop-bt");
6284 break;
6286 default:
6287 break;
6289 fprintf(f, "\n");
6290 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6291 break;
6293 default:
6294 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6295 break;
6298 if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6299 type != NT_ENCLOSE)
6300 fprintf(f, "\n");
6301 fflush(f);
6303 #endif /* ONIG_DEBUG */
6305 #ifdef ONIG_DEBUG_PARSE_TREE
6306 static void
6307 print_tree(FILE* f, Node* node)
6309 print_indent_tree(f, node, 0);
6311 #endif