1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004, 2005, 2007-2010 Free Software
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
27 #include <sys/types.h>
35 #define isgraph(C) (isprint(C) && !isspace(C))
38 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
39 #define ISALPHA(C) isalpha(C)
40 #define ISUPPER(C) isupper(C)
41 #define ISLOWER(C) islower(C)
42 #define ISDIGIT(C) isdigit(C)
43 #define ISXDIGIT(C) isxdigit(C)
44 #define ISSPACE(C) isspace(C)
45 #define ISPUNCT(C) ispunct(C)
46 #define ISALNUM(C) isalnum(C)
47 #define ISPRINT(C) isprint(C)
48 #define ISGRAPH(C) isgraph(C)
49 #define ISCNTRL(C) iscntrl(C)
51 #define ISALPHA(C) (isascii(C) && isalpha(C))
52 #define ISUPPER(C) (isascii(C) && isupper(C))
53 #define ISLOWER(C) (isascii(C) && islower(C))
54 #define ISDIGIT(C) (isascii(C) && isdigit(C))
55 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
56 #define ISSPACE(C) (isascii(C) && isspace(C))
57 #define ISPUNCT(C) (isascii(C) && ispunct(C))
58 #define ISALNUM(C) (isascii(C) && isalnum(C))
59 #define ISPRINT(C) (isascii(C) && isprint(C))
60 #define ISGRAPH(C) (isascii(C) && isgraph(C))
61 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
64 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
65 - Its arg may be any int or unsigned int; it need not be an unsigned char.
66 - It's guaranteed to evaluate its argument exactly once.
67 - It's typically faster.
68 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
69 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
70 it's important to use the locale's definition of `digit' even when the
71 host does not conform to Posix. */
72 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
74 /* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
76 #define _(str) gettext (str)
78 #include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
80 /* We can handle multibyte strings. */
83 # include <langinfo.h>
88 #include "hard-locale.h"
91 /* HPUX, define those as macros in sys/param.h */
99 static void dfamust (struct dfa
*dfa
);
100 static void regexp (int toplevel
);
102 #define CALLOC(p, t, n) ((p) = xcalloc((size_t)(n), sizeof (t)))
103 #define MALLOC(p, t, n) ((p) = xmalloc((n) * sizeof (t)))
104 #define REALLOC(p, t, n) ((p) = xrealloc((p), (n) * sizeof (t)))
106 /* Reallocate an array of type t if nalloc is too small for index. */
107 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
108 if ((index) >= (nalloc)) \
112 while ((index) >= (nalloc)); \
113 REALLOC(p, t, nalloc); \
124 fprintf(stderr
, "END");
125 else if (t
< NOTCHAR
)
126 fprintf(stderr
, "%c", t
);
131 case EMPTY
: s
= "EMPTY"; break;
132 case BACKREF
: s
= "BACKREF"; break;
133 case BEGLINE
: s
= "BEGLINE"; break;
134 case ENDLINE
: s
= "ENDLINE"; break;
135 case BEGWORD
: s
= "BEGWORD"; break;
136 case ENDWORD
: s
= "ENDWORD"; break;
137 case LIMWORD
: s
= "LIMWORD"; break;
138 case NOTLIMWORD
: s
= "NOTLIMWORD"; break;
139 case QMARK
: s
= "QMARK"; break;
140 case STAR
: s
= "STAR"; break;
141 case PLUS
: s
= "PLUS"; break;
142 case CAT
: s
= "CAT"; break;
143 case OR
: s
= "OR"; break;
144 case ORTOP
: s
= "ORTOP"; break;
145 case LPAREN
: s
= "LPAREN"; break;
146 case RPAREN
: s
= "RPAREN"; break;
148 case ANYCHAR
: s
= "ANYCHAR"; break;
149 case MBCSET
: s
= "MBCSET"; break;
150 #endif /* MBS_SUPPORT */
151 default: s
= "CSET"; break;
153 fprintf(stderr
, "%s", s
);
158 /* Stuff pertaining to charclasses. */
161 tstbit (unsigned int b
, charclass c
)
163 return c
[b
/ INTBITS
] & 1 << b
% INTBITS
;
167 setbit (unsigned int b
, charclass c
)
169 c
[b
/ INTBITS
] |= 1 << b
% INTBITS
;
173 clrbit (unsigned int b
, charclass c
)
175 c
[b
/ INTBITS
] &= ~(1 << b
% INTBITS
);
179 copyset (charclass src
, charclass dst
)
181 memcpy (dst
, src
, sizeof (charclass
));
185 zeroset (charclass s
)
187 memset (s
, 0, sizeof (charclass
));
195 for (i
= 0; i
< CHARCLASS_INTS
; ++i
)
200 equal (charclass s1
, charclass s2
)
202 return memcmp (s1
, s2
, sizeof (charclass
)) == 0;
205 /* A pointer to the current dfa is kept here during parsing. */
206 static struct dfa
*dfa
;
208 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
210 charclass_index (charclass s
)
214 for (i
= 0; i
< dfa
->cindex
; ++i
)
215 if (equal(s
, dfa
->charclasses
[i
]))
217 REALLOC_IF_NECESSARY(dfa
->charclasses
, charclass
, dfa
->calloc
, dfa
->cindex
);
219 copyset(s
, dfa
->charclasses
[i
]);
223 /* Syntax bits controlling the behavior of the lexical analyzer. */
224 static reg_syntax_t syntax_bits
, syntax_bits_set
;
226 /* Flag for case-folding letters into sets. */
227 static int case_fold
;
229 /* End-of-line byte in data. */
230 static unsigned char eolbyte
;
232 /* Entry point to set syntax options. */
234 dfasyntax (reg_syntax_t bits
, int fold
, unsigned char eol
)
242 /* Like setbit, but if case is folded, set both cases of a letter.
243 For MB_CUR_MAX > 1, one or both of the two cases may not be set,
244 so the resulting charset may only be used as an optimization. */
259 wint_t b1
= iswupper(b
) ? towlower(b
) : b
;
260 wint_t b2
= iswlower(b
) ? towupper(b
) : b
;
261 if (wctob ((unsigned char)b1
) == b1
)
263 if (b2
!= b1
&& wctob ((unsigned char)b2
) == b2
)
269 unsigned char b1
= ISUPPER(b
) ? tolower(b
) : b
;
270 unsigned char b2
= ISLOWER(b
) ? toupper(b
) : b
;
279 if (wctob ((unsigned char)b
) == b
)
286 /* UTF-8 encoding allows some optimizations that we can't otherwise
287 assume in a multibyte encoding. */
291 static int utf8
= -1;
294 #if defined HAVE_LANGINFO_CODESET && defined MBS_SUPPORT
295 utf8
= (strcmp (nl_langinfo (CODESET
), "UTF-8") == 0);
304 /* Lexical analyzer. All the dross that deals with the obnoxious
305 GNU Regex syntax bits is located here. The poor, suffering
306 reader is referred to the GNU Regex documentation for the
307 meaning of the @#%!@#%^!@ syntax bits. */
309 static char const *lexptr
; /* Pointer to next input character. */
310 static int lexleft
; /* Number of characters remaining. */
311 static token lasttok
; /* Previous token returned; initially END. */
312 static int laststart
; /* True if we're separated from beginning or (, |
313 only by zero-width characters. */
314 static int parens
; /* Count of outstanding left parens. */
315 static int minrep
, maxrep
; /* Repeat counts for {m,n}. */
316 static int hard_LC_COLLATE
; /* Nonzero if LC_COLLATE is hard. */
318 static int cur_mb_len
= 1; /* Length of the multibyte representation of
321 /* These variables are used only if (MB_CUR_MAX > 1). */
322 static mbstate_t mbs
; /* Mbstate for mbrlen(). */
323 static wchar_t wctok
; /* Wide character representation of the current
324 multibyte character. */
325 static unsigned char *mblen_buf
;/* Correspond to the input buffer in dfaexec().
326 Each element store the amount of remain
327 byte of corresponding multibyte character
328 in the input string. A element's value
329 is 0 if corresponding character is a
330 single byte chracter.
331 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
332 mblen_buf : 0, 3, 2, 1
334 static wchar_t *inputwcs
; /* Wide character representation of input
336 The length of this array is same as
337 the length of input string(char array).
338 inputstring[i] is a single-byte char,
339 or 1st byte of a multibyte char.
340 And inputwcs[i] is the codepoint. */
341 static unsigned char const *buf_begin
; /* reference to begin in dfaexec(). */
342 static unsigned char const *buf_end
; /* reference to end in dfaexec(). */
343 #endif /* MBS_SUPPORT */
347 /* Note that characters become unsigned here. */
348 # define FETCH_WC(c, wc, eoferr) \
355 return lasttok = END; \
360 cur_mb_len = mbrtowc(&_wc, lexptr, lexleft, &mbs); \
361 if (cur_mb_len <= 0) \
365 (wc) = (c) = (unsigned char) *lexptr++; \
369 lexptr += cur_mb_len; \
370 lexleft -= cur_mb_len; \
377 # define FETCH(c, eoferr) \
380 FETCH_WC(c, wc, eoferr); \
384 /* Note that characters become unsigned here. */
385 # define FETCH(c, eoferr) \
392 return lasttok = END; \
394 (c) = (unsigned char) *lexptr++; \
398 # define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
400 #endif /* MBS_SUPPORT */
403 in_coll_range (char ch
, char from
, char to
)
405 char c
[6] = { from
, 0, ch
, 0, to
, 0 };
406 return strcoll (&c
[0], &c
[2]) <= 0 && strcoll (&c
[2], &c
[4]) <= 0;
409 static int is_alpha(int c
) { return ISALPHA(c
); }
410 static int is_upper(int c
) { return ISUPPER(c
); }
411 static int is_lower(int c
) { return ISLOWER(c
); }
412 static int is_digit(int c
) { return ISDIGIT(c
); }
413 static int is_xdigit(int c
) { return ISXDIGIT(c
); }
414 static int is_space(int c
) { return ISSPACE(c
); }
415 static int is_punct(int c
) { return ISPUNCT(c
); }
416 static int is_alnum(int c
) { return ISALNUM(c
); }
417 static int is_print(int c
) { return ISPRINT(c
); }
418 static int is_graph(int c
) { return ISGRAPH(c
); }
419 static int is_cntrl(int c
) { return ISCNTRL(c
); }
424 return (c
== ' ' || c
== '\t');
427 typedef int predicate (int);
429 /* The following list maps the names of the Posix named character classes
430 to predicate functions that determine whether a given character is in
431 the class. The leading [ has already been eaten by the lexical analyzer. */
435 } const prednames
[] = {
436 { "alpha", is_alpha
},
437 { "upper", is_upper
},
438 { "lower", is_lower
},
439 { "digit", is_digit
},
440 { "xdigit", is_xdigit
},
441 { "space", is_space
},
442 { "punct", is_punct
},
443 { "alnum", is_alnum
},
444 { "print", is_print
},
445 { "graph", is_graph
},
446 { "cntrl", is_cntrl
},
447 { "blank", is_blank
},
452 find_pred (const char *str
)
455 for (i
= 0; prednames
[i
].name
; ++i
)
456 if (!strcmp(str
, prednames
[i
].name
))
459 return prednames
[i
].pred
;
462 /* Multibyte character handling sub-routine for lex.
463 This function parse a bracket expression and build a struct
466 parse_bracket_exp (void)
475 /* Work area to build a mb_char_classes. */
476 struct mb_char_classes
*work_mbc
;
477 int chars_al
, range_sts_al
, range_ends_al
, ch_classes_al
,
478 equivs_al
, coll_elems_al
;
481 range_sts_al
= range_ends_al
= 0;
482 ch_classes_al
= equivs_al
= coll_elems_al
= 0;
485 REALLOC_IF_NECESSARY(dfa
->mbcsets
, struct mb_char_classes
,
486 dfa
->mbcsets_alloc
, dfa
->nmbcsets
+ 1);
488 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
489 We will update dfa->multibyte_prop[] in addtok(), because we can't
490 decide the index in dfa->tokens[]. */
492 /* Initialize work area. */
493 work_mbc
= &(dfa
->mbcsets
[dfa
->nmbcsets
++]);
494 memset (work_mbc
, 0, sizeof *work_mbc
);
500 memset (ccl
, 0, sizeof(ccl
));
501 FETCH_WC (c
, wc
, _("unbalanced ["));
504 FETCH_WC (c
, wc
, _("unbalanced ["));
512 c1
= EOF
; /* mark c1 is not initialized". */
514 /* Note that if we're looking at some other [:...:] construct,
515 we just treat it as a bunch of ordinary characters. We can do
516 this because we assume regex has checked for syntax errors before
517 dfa is ever called. */
518 if (c
== '[' && (syntax_bits
& RE_CHAR_CLASSES
))
520 #define BRACKET_BUFFER_SIZE 128
521 char str
[BRACKET_BUFFER_SIZE
];
522 FETCH_WC (c1
, wc1
, _("unbalanced ["));
524 /* If pattern contains `[[:', `[[.', or `[[='. */
527 /* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1. */
528 || (MB_CUR_MAX
> 1 && (c1
== '.' || c1
== '='))
535 FETCH_WC (c
, wc
, _("unbalanced ["));
536 if ((c
== c1
&& *lexptr
== ']') || lexleft
== 0)
538 if (len
< BRACKET_BUFFER_SIZE
)
541 /* This is in any case an invalid class name. */
547 FETCH_WC (c
, wc
, _("unbalanced ["));
549 /* build character class. */
552 = (case_fold
&& (!strcmp (str
, "upper")
553 || !strcmp (str
, "lower"))
559 /* Store the character class as wctype_t. */
560 wctype_t wt
= wctype (class);
562 if (ch_classes_al
== 0)
563 MALLOC(work_mbc
->ch_classes
, wctype_t, ++ch_classes_al
);
564 REALLOC_IF_NECESSARY(work_mbc
->ch_classes
, wctype_t,
566 work_mbc
->nch_classes
+ 1);
567 work_mbc
->ch_classes
[work_mbc
->nch_classes
++] = wt
;
572 predicate
*pred
= find_pred (class);
574 dfaerror(_("invalid character class"));
575 for (c2
= 0; c2
< NOTCHAR
; ++c2
)
577 setbit_case_fold (c2
, ccl
);
582 else if (c1
== '=' || c1
== '.')
585 MALLOC(elem
, char, len
+ 1);
586 strncpy(elem
, str
, len
+ 1);
589 /* build equivalent class. */
592 MALLOC(work_mbc
->equivs
, char*, ++equivs_al
);
593 REALLOC_IF_NECESSARY(work_mbc
->equivs
, char*,
595 work_mbc
->nequivs
+ 1);
596 work_mbc
->equivs
[work_mbc
->nequivs
++] = elem
;
600 /* build collating element. */
602 if (coll_elems_al
== 0)
603 MALLOC(work_mbc
->coll_elems
, char*, ++coll_elems_al
);
604 REALLOC_IF_NECESSARY(work_mbc
->coll_elems
, char*,
606 work_mbc
->ncoll_elems
+ 1);
607 work_mbc
->coll_elems
[work_mbc
->ncoll_elems
++] = elem
;
612 /* Fetch new lookahead character. */
613 FETCH_WC (c1
, wc1
, _("unbalanced ["));
617 /* We treat '[' as a normal character here. c/c1/wc/wc1
618 are already set up. */
621 if (c
== '\\' && (syntax_bits
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
622 FETCH_WC(c
, wc
, _("unbalanced ["));
625 FETCH_WC(c1
, wc1
, _("unbalanced ["));
628 /* build range characters. */
630 FETCH_WC(c2
, wc2
, _("unbalanced ["));
633 /* In the case [x-], the - is an ordinary hyphen,
634 which is left in c1, the lookahead character. */
635 lexptr
-= cur_mb_len
;
636 lexleft
+= cur_mb_len
;
640 if (c1
== '-' && c2
!= ']')
643 && (syntax_bits
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
644 FETCH_WC(c2
, wc2
, _("unbalanced ["));
649 /* When case folding map a range, say [m-z] (or even [M-z])
650 to the pair of ranges, [m-z] [M-Z]. */
651 if (range_sts_al
== 0)
653 MALLOC(work_mbc
->range_sts
, wchar_t, ++range_sts_al
);
654 MALLOC(work_mbc
->range_ends
, wchar_t, ++range_ends_al
);
656 REALLOC_IF_NECESSARY(work_mbc
->range_sts
, wchar_t,
657 range_sts_al
, work_mbc
->nranges
+ 1);
658 REALLOC_IF_NECESSARY(work_mbc
->range_ends
, wchar_t,
659 range_ends_al
, work_mbc
->nranges
+ 1);
660 work_mbc
->range_sts
[work_mbc
->nranges
] =
661 case_fold
? towlower(wc
) : (wchar_t)wc
;
662 work_mbc
->range_ends
[work_mbc
->nranges
++] =
663 case_fold
? towlower(wc2
) : (wchar_t)wc2
;
666 if (case_fold
&& (iswalpha(wc
) || iswalpha(wc2
)))
668 REALLOC_IF_NECESSARY(work_mbc
->range_sts
, wchar_t,
669 range_sts_al
, work_mbc
->nranges
+ 1);
670 work_mbc
->range_sts
[work_mbc
->nranges
] = towupper(wc
);
671 REALLOC_IF_NECESSARY(work_mbc
->range_ends
, wchar_t,
672 range_ends_al
, work_mbc
->nranges
+ 1);
673 work_mbc
->range_ends
[work_mbc
->nranges
++] = towupper(wc2
);
686 if (!hard_LC_COLLATE
)
687 for (c
= c1
; c
<= c2
; c
++)
688 setbit_case_fold (c
, ccl
);
690 for (c
= 0; c
< NOTCHAR
; ++c
)
691 if (!(case_fold
&& ISUPPER (c
))
692 && in_coll_range (c
, c1
, c2
))
693 setbit_case_fold (c
, ccl
);
696 FETCH_WC(c1
, wc1
, _("unbalanced ["));
701 /* Build normal characters. */
702 setbit_case_fold (wc
, ccl
);
705 if (case_fold
&& iswalpha(wc
))
709 if (c
== EOF
|| (wint_t)c
!= (wint_t)wc
)
711 REALLOC_IF_NECESSARY(work_mbc
->chars
, wchar_t, chars_al
,
712 work_mbc
->nchars
+ 1);
713 work_mbc
->chars
[work_mbc
->nchars
++] = wc
;
722 if (c
== EOF
|| (wint_t)c
!= (wint_t)wc
)
724 REALLOC_IF_NECESSARY(work_mbc
->chars
, wchar_t, chars_al
,
725 work_mbc
->nchars
+ 1);
726 work_mbc
->chars
[work_mbc
->nchars
++] = wc
;
730 setbit_case_fold (c
, ccl
);
743 || work_mbc
->nchars
!= 0
744 || work_mbc
->nch_classes
!= 0
745 || work_mbc
->nranges
!= 0
746 || work_mbc
->nequivs
!= 0
747 || work_mbc
->ncoll_elems
!= 0))
749 static charclass zeroclass
;
750 work_mbc
->invert
= invert
;
751 work_mbc
->cset
= equal(ccl
, zeroclass
) ? -1 : charclass_index(ccl
);
759 assert(MB_CUR_MAX
== 1);
762 if (syntax_bits
& RE_HAT_LISTS_NOT_NEWLINE
)
763 clrbit(eolbyte
, ccl
);
766 return CSET
+ charclass_index(ccl
);
769 /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
770 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
780 /* Basic plan: We fetch a character. If it's a backslash,
781 we set the backslash flag and go through the loop again.
782 On the plus side, this avoids having a duplicate of the
783 main switch inside the backslash case. On the minus side,
784 it means that just about every case begins with
785 "if (backslash) ...". */
786 for (i
= 0; i
< 2; ++i
)
791 FETCH_WC (c
, wctok
, NULL
);
796 #endif /* MBS_SUPPORT */
805 dfaerror(_("unfinished \\ escape"));
812 if (syntax_bits
& RE_CONTEXT_INDEP_ANCHORS
816 return lasttok
= BEGLINE
;
822 if (syntax_bits
& RE_CONTEXT_INDEP_ANCHORS
824 || (syntax_bits
& RE_NO_BK_PARENS
825 ? lexleft
> 0 && *lexptr
== ')'
826 : lexleft
> 1 && lexptr
[0] == '\\' && lexptr
[1] == ')')
827 || (syntax_bits
& RE_NO_BK_VBAR
828 ? lexleft
> 0 && *lexptr
== '|'
829 : lexleft
> 1 && lexptr
[0] == '\\' && lexptr
[1] == '|')
830 || ((syntax_bits
& RE_NEWLINE_ALT
)
831 && lexleft
> 0 && *lexptr
== '\n'))
832 return lasttok
= ENDLINE
;
844 if (backslash
&& !(syntax_bits
& RE_NO_BK_REFS
))
847 return lasttok
= BACKREF
;
852 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
853 return lasttok
= BEGLINE
; /* FIXME: should be beginning of string */
857 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
858 return lasttok
= ENDLINE
; /* FIXME: should be end of string */
862 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
863 return lasttok
= BEGWORD
;
867 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
868 return lasttok
= ENDWORD
;
872 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
873 return lasttok
= LIMWORD
;
877 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
878 return lasttok
= NOTLIMWORD
;
882 if (syntax_bits
& RE_LIMITED_OPS
)
884 if (backslash
!= ((syntax_bits
& RE_BK_PLUS_QM
) != 0))
886 if (!(syntax_bits
& RE_CONTEXT_INDEP_OPS
) && laststart
)
888 return lasttok
= QMARK
;
893 if (!(syntax_bits
& RE_CONTEXT_INDEP_OPS
) && laststart
)
895 return lasttok
= STAR
;
898 if (syntax_bits
& RE_LIMITED_OPS
)
900 if (backslash
!= ((syntax_bits
& RE_BK_PLUS_QM
) != 0))
902 if (!(syntax_bits
& RE_CONTEXT_INDEP_OPS
) && laststart
)
904 return lasttok
= PLUS
;
907 if (!(syntax_bits
& RE_INTERVALS
))
909 if (backslash
!= ((syntax_bits
& RE_NO_BK_BRACES
) == 0))
911 if (!(syntax_bits
& RE_CONTEXT_INDEP_OPS
) && laststart
)
914 if (syntax_bits
& RE_NO_BK_BRACES
)
916 /* Scan ahead for a valid interval; if it's not valid,
917 treat it as a literal '{'. */
918 int lo
= -1, hi
= -1;
919 char const *p
= lexptr
;
920 char const *lim
= p
+ lexleft
;
921 for (; p
!= lim
&& ISASCIIDIGIT (*p
); p
++)
922 lo
= (lo
< 0 ? 0 : lo
* 10) + *p
- '0';
923 if (p
!= lim
&& *p
== ',')
924 while (++p
!= lim
&& ISASCIIDIGIT (*p
))
925 hi
= (hi
< 0 ? 0 : hi
* 10) + *p
- '0';
928 if (p
== lim
|| *p
!= '}'
929 || lo
< 0 || RE_DUP_MAX
< hi
|| (0 <= hi
&& hi
< lo
))
936 {M,} - minimum count, maximum is infinity
937 {M,N} - M through N */
938 FETCH(c
, _("unfinished repeat count"));
939 if (ISASCIIDIGIT (c
))
944 FETCH(c
, _("unfinished repeat count"));
945 if (! ISASCIIDIGIT (c
))
947 minrep
= 10 * minrep
+ c
- '0';
951 dfaerror(_("malformed repeat count"));
954 FETCH (c
, _("unfinished repeat count"));
955 if (! ISASCIIDIGIT (c
))
962 FETCH (c
, _("unfinished repeat count"));
963 if (! ISASCIIDIGIT (c
))
965 maxrep
= 10 * maxrep
+ c
- '0';
967 if (0 <= maxrep
&& maxrep
< minrep
)
968 dfaerror (_("malformed repeat count"));
973 if (!(syntax_bits
& RE_NO_BK_BRACES
))
976 dfaerror(_("malformed repeat count"));
977 FETCH(c
, _("unfinished repeat count"));
980 dfaerror(_("malformed repeat count"));
983 dfa
->broken
= (minrep
== maxrep
&& minrep
== 0);
985 return lasttok
= REPMN
;
988 if (syntax_bits
& RE_LIMITED_OPS
)
990 if (backslash
!= ((syntax_bits
& RE_NO_BK_VBAR
) == 0))
996 if (syntax_bits
& RE_LIMITED_OPS
998 || !(syntax_bits
& RE_NEWLINE_ALT
))
1001 return lasttok
= OR
;
1004 if (backslash
!= ((syntax_bits
& RE_NO_BK_PARENS
) == 0))
1008 return lasttok
= LPAREN
;
1011 if (backslash
!= ((syntax_bits
& RE_NO_BK_PARENS
) == 0))
1013 if (parens
== 0 && syntax_bits
& RE_UNMATCHED_RIGHT_PAREN_ORD
)
1017 return lasttok
= RPAREN
;
1025 /* In multibyte environment period must match with a single
1026 character not a byte. So we use ANYCHAR. */
1028 return lasttok
= ANYCHAR
;
1030 #endif /* MBS_SUPPORT */
1033 if (!(syntax_bits
& RE_DOT_NEWLINE
))
1034 clrbit(eolbyte
, ccl
);
1035 if (syntax_bits
& RE_DOT_NOT_NULL
)
1038 return lasttok
= CSET
+ charclass_index(ccl
);
1043 if (!backslash
|| (syntax_bits
& RE_NO_GNU_OPS
))
1046 for (c2
= 0; c2
< NOTCHAR
; ++c2
)
1052 return lasttok
= CSET
+ charclass_index(ccl
);
1057 if (!backslash
|| (syntax_bits
& RE_NO_GNU_OPS
))
1060 for (c2
= 0; c2
< NOTCHAR
; ++c2
)
1061 if (IS_WORD_CONSTITUENT(c2
))
1066 return lasttok
= CSET
+ charclass_index(ccl
);
1072 return lasttok
= parse_bracket_exp();
1078 /* For multibyte character sets, folding is done in atom. Always
1081 return lasttok
= WCHAR
;
1084 if (case_fold
&& ISALPHA(c
))
1087 setbit_case_fold (c
, ccl
);
1088 return lasttok
= CSET
+ charclass_index(ccl
);
1095 /* The above loop should consume at most a backslash
1096 and some other character. */
1098 return END
; /* keeps pedantic compilers happy. */
1101 /* Recursive descent parser for regular expressions. */
1103 static token tok
; /* Lookahead token. */
1104 static int depth
; /* Current depth of a hypothetical stack
1105 holding deferred productions. This is
1106 used to determine the depth that will be
1107 required of the real stack later on in
1111 addtok_mb (token t
, int mbprop
)
1116 REALLOC_IF_NECESSARY(dfa
->multibyte_prop
, int, dfa
->nmultibyte_prop
,
1118 dfa
->multibyte_prop
[dfa
->tindex
] = mbprop
;
1124 REALLOC_IF_NECESSARY(dfa
->tokens
, token
, dfa
->talloc
, dfa
->tindex
);
1125 dfa
->tokens
[dfa
->tindex
++] = t
;
1146 if (depth
> dfa
->depth
)
1150 /* Add the given token to the parse tree, maintaining the depth count and
1151 updating the maximum depth if necessary. */
1156 if (MB_CUR_MAX
> 1 && t
== MBCSET
)
1157 addtok_mb (MBCSET
, ((dfa
->nmbcsets
- 1) << 2) + 3);
1164 /* We treat a multibyte character as a single atom, so that DFA
1165 can treat a multibyte character as a single expression.
1167 e.g. We construct following tree from "<mb1><mb2>".
1168 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1169 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1171 addtok_wc (wint_t wc
)
1173 unsigned char buf
[MB_LEN_MAX
];
1176 memset (&s
, 0, sizeof(s
));
1177 cur_mb_len
= wcrtomb ((char *) buf
, wc
, &s
);
1178 addtok_mb(buf
[0], cur_mb_len
== 1 ? 3 : 1);
1179 for (i
= 1; i
< cur_mb_len
; i
++)
1181 addtok_mb(buf
[i
], i
== cur_mb_len
- 1 ? 2 : 0);
1187 /* The grammar understood by the parser is as follows.
1206 <multibyte character>
1217 LPAREN regexp RPAREN
1220 The parser builds a parse tree in postfix form in an array of tokens. */
1228 addtok_wc (case_fold
? towlower(wctok
) : wctok
);
1230 if (case_fold
&& iswalpha(wctok
))
1232 addtok_wc (towupper(wctok
));
1240 #endif /* MBS_SUPPORT */
1242 if ((tok
>= 0 && tok
< NOTCHAR
) || tok
>= CSET
|| tok
== BACKREF
1243 || tok
== BEGLINE
|| tok
== ENDLINE
|| tok
== BEGWORD
1245 || tok
== ANYCHAR
|| tok
== MBCSET
/* MB_CUR_MAX > 1 */
1246 #endif /* MBS_SUPPORT */
1247 || tok
== ENDWORD
|| tok
== LIMWORD
|| tok
== NOTLIMWORD
)
1252 else if (tok
== LPAREN
)
1257 dfaerror(_("unbalanced ("));
1264 /* Return the number of tokens in the given subexpression. */
1266 nsubtoks (int tindex
)
1270 switch (dfa
->tokens
[tindex
- 1])
1277 return 1 + nsubtoks(tindex
- 1);
1281 ntoks1
= nsubtoks(tindex
- 1);
1282 return 1 + ntoks1
+ nsubtoks(tindex
- 1 - ntoks1
);
1286 /* Copy the given subexpression to the top of the tree. */
1288 copytoks (int tindex
, int ntokens
)
1292 for (i
= 0; i
< ntokens
; ++i
)
1294 addtok(dfa
->tokens
[tindex
+ i
]);
1296 /* Update index into multibyte csets. */
1297 if (MB_CUR_MAX
> 1 && dfa
->tokens
[tindex
+ i
] == MBCSET
)
1298 dfa
->multibyte_prop
[dfa
->tindex
- 1] = dfa
->multibyte_prop
[tindex
+ i
];
1306 int tindex
, ntokens
, i
;
1309 while (tok
== QMARK
|| tok
== STAR
|| tok
== PLUS
|| tok
== REPMN
)
1312 ntokens
= nsubtoks(dfa
->tindex
);
1313 tindex
= dfa
->tindex
- ntokens
;
1318 for (i
= 1; i
< minrep
; ++i
)
1320 copytoks(tindex
, ntokens
);
1323 for (; i
< maxrep
; ++i
)
1325 copytoks(tindex
, ntokens
);
1342 while (tok
!= RPAREN
&& tok
!= OR
&& tok
>= 0)
1350 regexp (int toplevel
)
1364 /* Main entry point for the parser. S is a string to be parsed, len is the
1365 length of the string, so s can include NUL characters. D is a pointer to
1366 the struct dfa to parse into. */
1368 dfaparse (char const *s
, size_t len
, struct dfa
*d
)
1377 hard_LC_COLLATE
= hard_locale (LC_COLLATE
);
1383 memset(&mbs
, 0, sizeof(mbstate_t));
1385 #endif /* MBS_SUPPORT */
1387 if (! syntax_bits_set
)
1388 dfaerror(_("no syntax specified"));
1396 dfaerror(_("unbalanced )"));
1398 addtok(END
- d
->nregexps
);
1407 /* Some primitives for operating on sets of positions. */
1409 /* Copy one set to another; the destination must be large enough. */
1411 copy (position_set
const *src
, position_set
*dst
)
1415 for (i
= 0; i
< src
->nelem
; ++i
)
1416 dst
->elems
[i
] = src
->elems
[i
];
1417 dst
->nelem
= src
->nelem
;
1420 /* Insert a position in a set. Position sets are maintained in sorted
1421 order according to index. If position already exists in the set with
1422 the same index then their constraints are logically or'd together.
1423 S->elems must point to an array large enough to hold the resulting set. */
1425 insert (position p
, position_set
*s
)
1427 int count
= s
->nelem
;
1428 int lo
= 0, hi
= count
;
1431 int mid
= ((unsigned) lo
+ (unsigned) hi
) >> 1;
1432 if (s
->elems
[mid
].index
< p
.index
)
1438 if (lo
< count
&& p
.index
== s
->elems
[lo
].index
)
1439 s
->elems
[lo
].constraint
|= p
.constraint
;
1443 for (i
= count
; i
> lo
; i
--)
1444 s
->elems
[i
] = s
->elems
[i
- 1];
1450 /* Merge two sets of positions into a third. The result is exactly as if
1451 the positions of both sets were inserted into an initially empty set. */
1453 merge (position_set
const *s1
, position_set
const *s2
, position_set
*m
)
1458 while (i
< s1
->nelem
&& j
< s2
->nelem
)
1459 if (s1
->elems
[i
].index
> s2
->elems
[j
].index
)
1460 m
->elems
[m
->nelem
++] = s1
->elems
[i
++];
1461 else if (s1
->elems
[i
].index
< s2
->elems
[j
].index
)
1462 m
->elems
[m
->nelem
++] = s2
->elems
[j
++];
1465 m
->elems
[m
->nelem
] = s1
->elems
[i
++];
1466 m
->elems
[m
->nelem
++].constraint
|= s2
->elems
[j
++].constraint
;
1468 while (i
< s1
->nelem
)
1469 m
->elems
[m
->nelem
++] = s1
->elems
[i
++];
1470 while (j
< s2
->nelem
)
1471 m
->elems
[m
->nelem
++] = s2
->elems
[j
++];
1474 /* Delete a position from a set. */
1476 delete (position p
, position_set
*s
)
1480 for (i
= 0; i
< s
->nelem
; ++i
)
1481 if (p
.index
== s
->elems
[i
].index
)
1484 for (--s
->nelem
; i
< s
->nelem
; ++i
)
1485 s
->elems
[i
] = s
->elems
[i
+ 1];
1488 /* Find the index of the state corresponding to the given position set with
1489 the given preceding context, or create a new state if there is no such
1490 state. Newline and letter tell whether we got here on a newline or
1491 letter, respectively. */
1493 state_index (struct dfa
*d
, position_set
const *s
, int newline
, int letter
)
1499 newline
= newline
? 1 : 0;
1500 letter
= letter
? 1 : 0;
1502 for (i
= 0; i
< s
->nelem
; ++i
)
1503 hash
^= s
->elems
[i
].index
+ s
->elems
[i
].constraint
;
1505 /* Try to find a state that exactly matches the proposed one. */
1506 for (i
= 0; i
< d
->sindex
; ++i
)
1508 if (hash
!= d
->states
[i
].hash
|| s
->nelem
!= d
->states
[i
].elems
.nelem
1509 || newline
!= d
->states
[i
].newline
|| letter
!= d
->states
[i
].letter
)
1511 for (j
= 0; j
< s
->nelem
; ++j
)
1512 if (s
->elems
[j
].constraint
1513 != d
->states
[i
].elems
.elems
[j
].constraint
1514 || s
->elems
[j
].index
!= d
->states
[i
].elems
.elems
[j
].index
)
1520 /* We'll have to create a new state. */
1521 REALLOC_IF_NECESSARY(d
->states
, dfa_state
, d
->salloc
, d
->sindex
);
1522 d
->states
[i
].hash
= hash
;
1523 MALLOC(d
->states
[i
].elems
.elems
, position
, s
->nelem
);
1524 copy(s
, &d
->states
[i
].elems
);
1525 d
->states
[i
].newline
= newline
;
1526 d
->states
[i
].letter
= letter
;
1527 d
->states
[i
].backref
= 0;
1528 d
->states
[i
].constraint
= 0;
1529 d
->states
[i
].first_end
= 0;
1531 d
->states
[i
].mbps
.nelem
= 0;
1532 d
->states
[i
].mbps
.elems
= NULL
;
1534 for (j
= 0; j
< s
->nelem
; ++j
)
1535 if (d
->tokens
[s
->elems
[j
].index
] < 0)
1537 constraint
= s
->elems
[j
].constraint
;
1538 if (SUCCEEDS_IN_CONTEXT(constraint
, newline
, 0, letter
, 0)
1539 || SUCCEEDS_IN_CONTEXT(constraint
, newline
, 0, letter
, 1)
1540 || SUCCEEDS_IN_CONTEXT(constraint
, newline
, 1, letter
, 0)
1541 || SUCCEEDS_IN_CONTEXT(constraint
, newline
, 1, letter
, 1))
1542 d
->states
[i
].constraint
|= constraint
;
1543 if (! d
->states
[i
].first_end
)
1544 d
->states
[i
].first_end
= d
->tokens
[s
->elems
[j
].index
];
1546 else if (d
->tokens
[s
->elems
[j
].index
] == BACKREF
)
1548 d
->states
[i
].constraint
= NO_CONSTRAINT
;
1549 d
->states
[i
].backref
= 1;
1557 /* Find the epsilon closure of a set of positions. If any position of the set
1558 contains a symbol that matches the empty string in some context, replace
1559 that position with the elements of its follow labeled with an appropriate
1560 constraint. Repeat exhaustively until no funny positions are left.
1561 S->elems must be large enough to hold the result. */
1563 epsclosure (position_set
*s
, struct dfa
const *d
)
1566 char *visited
; /* array of booleans, enough to use char, not int */
1569 CALLOC(visited
, char, d
->tindex
);
1571 for (i
= 0; i
< s
->nelem
; ++i
)
1572 if (d
->tokens
[s
->elems
[i
].index
] >= NOTCHAR
1573 && d
->tokens
[s
->elems
[i
].index
] != BACKREF
1575 && d
->tokens
[s
->elems
[i
].index
] != ANYCHAR
1576 && d
->tokens
[s
->elems
[i
].index
] != MBCSET
1578 && d
->tokens
[s
->elems
[i
].index
] < CSET
)
1581 p
.constraint
= old
.constraint
;
1582 delete(s
->elems
[i
], s
);
1583 if (visited
[old
.index
])
1588 visited
[old
.index
] = 1;
1589 switch (d
->tokens
[old
.index
])
1592 p
.constraint
&= BEGLINE_CONSTRAINT
;
1595 p
.constraint
&= ENDLINE_CONSTRAINT
;
1598 p
.constraint
&= BEGWORD_CONSTRAINT
;
1601 p
.constraint
&= ENDWORD_CONSTRAINT
;
1604 p
.constraint
&= LIMWORD_CONSTRAINT
;
1607 p
.constraint
&= NOTLIMWORD_CONSTRAINT
;
1612 for (j
= 0; j
< d
->follows
[old
.index
].nelem
; ++j
)
1614 p
.index
= d
->follows
[old
.index
].elems
[j
].index
;
1617 /* Force rescan to start at the beginning. */
1624 /* Perform bottom-up analysis on the parse tree, computing various functions.
1625 Note that at this point, we're pretending constructs like \< are real
1626 characters rather than constraints on what can follow them.
1628 Nullable: A node is nullable if it is at the root of a regexp that can
1629 match the empty string.
1630 * EMPTY leaves are nullable.
1631 * No other leaf is nullable.
1632 * A QMARK or STAR node is nullable.
1633 * A PLUS node is nullable if its argument is nullable.
1634 * A CAT node is nullable if both its arguments are nullable.
1635 * An OR node is nullable if either argument is nullable.
1637 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1638 that could correspond to the first character of a string matching the
1639 regexp rooted at the given node.
1640 * EMPTY leaves have empty firstpos.
1641 * The firstpos of a nonempty leaf is that leaf itself.
1642 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1644 * The firstpos of a CAT node is the firstpos of the left argument, union
1645 the firstpos of the right if the left argument is nullable.
1646 * The firstpos of an OR node is the union of firstpos of each argument.
1648 Lastpos: The lastpos of a node is the set of positions that could
1649 correspond to the last character of a string matching the regexp at
1651 * EMPTY leaves have empty lastpos.
1652 * The lastpos of a nonempty leaf is that leaf itself.
1653 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1655 * The lastpos of a CAT node is the lastpos of its right argument, union
1656 the lastpos of the left if the right argument is nullable.
1657 * The lastpos of an OR node is the union of the lastpos of each argument.
1659 Follow: The follow of a position is the set of positions that could
1660 correspond to the character following a character matching the node in
1661 a string matching the regexp. At this point we consider special symbols
1662 that match the empty string in some context to be just normal characters.
1663 Later, if we find that a special symbol is in a follow set, we will
1664 replace it with the elements of its follow, labeled with an appropriate
1666 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1667 the follow of every node in the lastpos.
1668 * Every node in the firstpos of the second argument of a CAT node is in
1669 the follow of every node in the lastpos of the first argument.
1671 Because of the postfix representation of the parse tree, the depth-first
1672 analysis is conveniently done by a linear scan with the aid of a stack.
1673 Sets are stored as arrays of the elements, obeying a stack-like allocation
1674 scheme; the number of elements in each set deeper in the stack can be
1675 used to determine the address of a particular set's array. */
1677 dfaanalyze (struct dfa
*d
, int searchflag
)
1679 int *nullable
; /* Nullable stack. */
1680 int *nfirstpos
; /* Element count stack for firstpos sets. */
1681 position
*firstpos
; /* Array where firstpos elements are stored. */
1682 int *nlastpos
; /* Element count stack for lastpos sets. */
1683 position
*lastpos
; /* Array where lastpos elements are stored. */
1684 int *nalloc
; /* Sizes of arrays allocated to follow sets. */
1685 position_set tmp
; /* Temporary set for merging sets. */
1686 position_set merged
; /* Result of merging sets. */
1687 int wants_newline
; /* True if some position wants newline info. */
1689 int *o_nfirst
, *o_nlast
;
1690 position
*o_firstpos
, *o_lastpos
;
1695 fprintf(stderr
, "dfaanalyze:\n");
1696 for (i
= 0; i
< d
->tindex
; ++i
)
1698 fprintf(stderr
, " %d:", i
);
1699 prtok(d
->tokens
[i
]);
1704 d
->searchflag
= searchflag
;
1706 MALLOC(nullable
, int, d
->depth
);
1707 o_nullable
= nullable
;
1708 MALLOC(nfirstpos
, int, d
->depth
);
1709 o_nfirst
= nfirstpos
;
1710 MALLOC(firstpos
, position
, d
->nleaves
);
1711 o_firstpos
= firstpos
, firstpos
+= d
->nleaves
;
1712 MALLOC(nlastpos
, int, d
->depth
);
1714 MALLOC(lastpos
, position
, d
->nleaves
);
1715 o_lastpos
= lastpos
, lastpos
+= d
->nleaves
;
1716 CALLOC(nalloc
, int, d
->tindex
);
1717 MALLOC(merged
.elems
, position
, d
->nleaves
);
1719 CALLOC(d
->follows
, position_set
, d
->tindex
);
1721 for (i
= 0; i
< d
->tindex
; ++i
)
1723 { /* Nonsyntactic #ifdef goo... */
1725 switch (d
->tokens
[i
])
1728 /* The empty set is nullable. */
1731 /* The firstpos and lastpos of the empty leaf are both empty. */
1732 *nfirstpos
++ = *nlastpos
++ = 0;
1737 /* Every element in the firstpos of the argument is in the follow
1738 of every element in the lastpos. */
1739 tmp
.nelem
= nfirstpos
[-1];
1740 tmp
.elems
= firstpos
;
1742 for (j
= 0; j
< nlastpos
[-1]; ++j
)
1744 merge(&tmp
, &d
->follows
[pos
[j
].index
], &merged
);
1745 REALLOC_IF_NECESSARY(d
->follows
[pos
[j
].index
].elems
, position
,
1746 nalloc
[pos
[j
].index
], merged
.nelem
- 1);
1747 copy(&merged
, &d
->follows
[pos
[j
].index
]);
1751 /* A QMARK or STAR node is automatically nullable. */
1752 if (d
->tokens
[i
] != PLUS
)
1757 /* Every element in the firstpos of the second argument is in the
1758 follow of every element in the lastpos of the first argument. */
1759 tmp
.nelem
= nfirstpos
[-1];
1760 tmp
.elems
= firstpos
;
1761 pos
= lastpos
+ nlastpos
[-1];
1762 for (j
= 0; j
< nlastpos
[-2]; ++j
)
1764 merge(&tmp
, &d
->follows
[pos
[j
].index
], &merged
);
1765 REALLOC_IF_NECESSARY(d
->follows
[pos
[j
].index
].elems
, position
,
1766 nalloc
[pos
[j
].index
], merged
.nelem
- 1);
1767 copy(&merged
, &d
->follows
[pos
[j
].index
]);
1770 /* The firstpos of a CAT node is the firstpos of the first argument,
1771 union that of the second argument if the first is nullable. */
1773 nfirstpos
[-2] += nfirstpos
[-1];
1775 firstpos
+= nfirstpos
[-1];
1778 /* The lastpos of a CAT node is the lastpos of the second argument,
1779 union that of the first argument if the second is nullable. */
1781 nlastpos
[-2] += nlastpos
[-1];
1784 pos
= lastpos
+ nlastpos
[-2];
1785 for (j
= nlastpos
[-1] - 1; j
>= 0; --j
)
1786 pos
[j
] = lastpos
[j
];
1787 lastpos
+= nlastpos
[-2];
1788 nlastpos
[-2] = nlastpos
[-1];
1792 /* A CAT node is nullable if both arguments are nullable. */
1793 nullable
[-2] = nullable
[-1] && nullable
[-2];
1799 /* The firstpos is the union of the firstpos of each argument. */
1800 nfirstpos
[-2] += nfirstpos
[-1];
1803 /* The lastpos is the union of the lastpos of each argument. */
1804 nlastpos
[-2] += nlastpos
[-1];
1807 /* An OR node is nullable if either argument is nullable. */
1808 nullable
[-2] = nullable
[-1] || nullable
[-2];
1813 /* Anything else is a nonempty position. (Note that special
1814 constructs like \< are treated as nonempty strings here;
1815 an "epsilon closure" effectively makes them nullable later.
1816 Backreferences have to get a real position so we can detect
1817 transitions on them later. But they are nullable. */
1818 *nullable
++ = d
->tokens
[i
] == BACKREF
;
1820 /* This position is in its own firstpos and lastpos. */
1821 *nfirstpos
++ = *nlastpos
++ = 1;
1822 --firstpos
, --lastpos
;
1823 firstpos
->index
= lastpos
->index
= i
;
1824 firstpos
->constraint
= lastpos
->constraint
= NO_CONSTRAINT
;
1826 /* Allocate the follow set for this position. */
1828 MALLOC(d
->follows
[i
].elems
, position
, nalloc
[i
]);
1832 /* ... balance the above nonsyntactic #ifdef goo... */
1833 fprintf(stderr
, "node %d:", i
);
1834 prtok(d
->tokens
[i
]);
1836 fprintf(stderr
, nullable
[-1] ? " nullable: yes\n" : " nullable: no\n");
1837 fprintf(stderr
, " firstpos:");
1838 for (j
= nfirstpos
[-1] - 1; j
>= 0; --j
)
1840 fprintf(stderr
, " %d:", firstpos
[j
].index
);
1841 prtok(d
->tokens
[firstpos
[j
].index
]);
1843 fprintf(stderr
, "\n lastpos:");
1844 for (j
= nlastpos
[-1] - 1; j
>= 0; --j
)
1846 fprintf(stderr
, " %d:", lastpos
[j
].index
);
1847 prtok(d
->tokens
[lastpos
[j
].index
]);
1853 /* For each follow set that is the follow set of a real position, replace
1854 it with its epsilon closure. */
1855 for (i
= 0; i
< d
->tindex
; ++i
)
1856 if (d
->tokens
[i
] < NOTCHAR
|| d
->tokens
[i
] == BACKREF
1858 || d
->tokens
[i
] == ANYCHAR
1859 || d
->tokens
[i
] == MBCSET
1861 || d
->tokens
[i
] >= CSET
)
1864 fprintf(stderr
, "follows(%d:", i
);
1865 prtok(d
->tokens
[i
]);
1866 fprintf(stderr
, "):");
1867 for (j
= d
->follows
[i
].nelem
- 1; j
>= 0; --j
)
1869 fprintf(stderr
, " %d:", d
->follows
[i
].elems
[j
].index
);
1870 prtok(d
->tokens
[d
->follows
[i
].elems
[j
].index
]);
1874 copy(&d
->follows
[i
], &merged
);
1875 epsclosure(&merged
, d
);
1876 if (d
->follows
[i
].nelem
< merged
.nelem
)
1877 REALLOC(d
->follows
[i
].elems
, position
, merged
.nelem
);
1878 copy(&merged
, &d
->follows
[i
]);
1881 /* Get the epsilon closure of the firstpos of the regexp. The result will
1882 be the set of positions of state 0. */
1884 for (i
= 0; i
< nfirstpos
[-1]; ++i
)
1885 insert(firstpos
[i
], &merged
);
1886 epsclosure(&merged
, d
);
1888 /* Check if any of the positions of state 0 will want newline context. */
1890 for (i
= 0; i
< merged
.nelem
; ++i
)
1891 if (PREV_NEWLINE_DEPENDENT(merged
.elems
[i
].constraint
))
1894 /* Build the initial state. */
1897 MALLOC(d
->states
, dfa_state
, d
->salloc
);
1898 state_index(d
, &merged
, wants_newline
, 0);
1909 /* Find, for each character, the transition out of state s of d, and store
1910 it in the appropriate slot of trans.
1912 We divide the positions of s into groups (positions can appear in more
1913 than one group). Each group is labeled with a set of characters that
1914 every position in the group matches (taking into account, if necessary,
1915 preceding context information of s). For each group, find the union
1916 of the its elements' follows. This set is the set of positions of the
1917 new state. For each character in the group's label, set the transition
1918 on this character to be to a state corresponding to the set's positions,
1919 and its associated backward context information, if necessary.
1921 If we are building a searching matcher, we include the positions of state
1924 The collection of groups is constructed by building an equivalence-class
1925 partition of the positions of s.
1927 For each position, find the set of characters C that it matches. Eliminate
1928 any characters from C that fail on grounds of backward context.
1930 Search through the groups, looking for a group whose label L has nonempty
1931 intersection with C. If L - C is nonempty, create a new group labeled
1932 L - C and having the same positions as the current group, and set L to
1933 the intersection of L and C. Insert the position in this group, set
1934 C = C - L, and resume scanning.
1936 If after comparing with every group there are characters remaining in C,
1937 create a new group labeled with the characters of C and insert this
1938 position in that group. */
1940 dfastate (int s
, struct dfa
*d
, int trans
[])
1942 position_set grps
[NOTCHAR
]; /* As many as will ever be needed. */
1943 charclass labels
[NOTCHAR
]; /* Labels corresponding to the groups. */
1944 int ngrps
= 0; /* Number of groups actually used. */
1945 position pos
; /* Current position being considered. */
1946 charclass matches
; /* Set of matching characters. */
1947 int matchesf
; /* True if matches is nonempty. */
1948 charclass intersect
; /* Intersection with some label set. */
1949 int intersectf
; /* True if intersect is nonempty. */
1950 charclass leftovers
; /* Stuff in the label that didn't match. */
1951 int leftoversf
; /* True if leftovers is nonempty. */
1952 static charclass letters
; /* Set of characters considered letters. */
1953 static charclass newline
; /* Set of characters that aren't newline. */
1954 position_set follows
; /* Union of the follows of some group. */
1955 position_set tmp
; /* Temporary space for merging sets. */
1956 int state
; /* New state. */
1957 int wants_newline
; /* New state wants to know newline context. */
1958 int state_newline
; /* New state on a newline transition. */
1959 int wants_letter
; /* New state wants to know letter context. */
1960 int state_letter
; /* New state on a letter transition. */
1961 static int initialized
; /* Flag for static initialization. */
1963 int next_isnt_1st_byte
= 0; /* Flag if we can't add state0. */
1967 /* Initialize the set of letters, if necessary. */
1971 for (i
= 0; i
< NOTCHAR
; ++i
)
1972 if (IS_WORD_CONSTITUENT(i
))
1974 setbit(eolbyte
, newline
);
1979 for (i
= 0; i
< d
->states
[s
].elems
.nelem
; ++i
)
1981 pos
= d
->states
[s
].elems
.elems
[i
];
1982 if (d
->tokens
[pos
.index
] >= 0 && d
->tokens
[pos
.index
] < NOTCHAR
)
1983 setbit(d
->tokens
[pos
.index
], matches
);
1984 else if (d
->tokens
[pos
.index
] >= CSET
)
1985 copyset(d
->charclasses
[d
->tokens
[pos
.index
] - CSET
], matches
);
1987 else if (d
->tokens
[pos
.index
] == ANYCHAR
1988 || d
->tokens
[pos
.index
] == MBCSET
)
1989 /* MB_CUR_MAX > 1 */
1991 /* ANYCHAR and MBCSET must match with a single character, so we
1992 must put it to d->states[s].mbps, which contains the positions
1993 which can match with a single character not a byte. */
1994 if (d
->states
[s
].mbps
.nelem
== 0)
1996 MALLOC(d
->states
[s
].mbps
.elems
, position
,
1997 d
->states
[s
].elems
.nelem
);
1999 insert(pos
, &(d
->states
[s
].mbps
));
2002 #endif /* MBS_SUPPORT */
2006 /* Some characters may need to be eliminated from matches because
2007 they fail in the current context. */
2008 if (pos
.constraint
!= 0xFF)
2010 if (! MATCHES_NEWLINE_CONTEXT(pos
.constraint
,
2011 d
->states
[s
].newline
, 1))
2012 clrbit(eolbyte
, matches
);
2013 if (! MATCHES_NEWLINE_CONTEXT(pos
.constraint
,
2014 d
->states
[s
].newline
, 0))
2015 for (j
= 0; j
< CHARCLASS_INTS
; ++j
)
2016 matches
[j
] &= newline
[j
];
2017 if (! MATCHES_LETTER_CONTEXT(pos
.constraint
,
2018 d
->states
[s
].letter
, 1))
2019 for (j
= 0; j
< CHARCLASS_INTS
; ++j
)
2020 matches
[j
] &= ~letters
[j
];
2021 if (! MATCHES_LETTER_CONTEXT(pos
.constraint
,
2022 d
->states
[s
].letter
, 0))
2023 for (j
= 0; j
< CHARCLASS_INTS
; ++j
)
2024 matches
[j
] &= letters
[j
];
2026 /* If there are no characters left, there's no point in going on. */
2027 for (j
= 0; j
< CHARCLASS_INTS
&& !matches
[j
]; ++j
)
2029 if (j
== CHARCLASS_INTS
)
2033 for (j
= 0; j
< ngrps
; ++j
)
2035 /* If matches contains a single character only, and the current
2036 group's label doesn't contain that character, go on to the
2038 if (d
->tokens
[pos
.index
] >= 0 && d
->tokens
[pos
.index
] < NOTCHAR
2039 && !tstbit(d
->tokens
[pos
.index
], labels
[j
]))
2042 /* Check if this group's label has a nonempty intersection with
2045 for (k
= 0; k
< CHARCLASS_INTS
; ++k
)
2046 (intersect
[k
] = matches
[k
] & labels
[j
][k
]) ? (intersectf
= 1) : 0;
2050 /* It does; now find the set differences both ways. */
2051 leftoversf
= matchesf
= 0;
2052 for (k
= 0; k
< CHARCLASS_INTS
; ++k
)
2054 /* Even an optimizing compiler can't know this for sure. */
2055 int match
= matches
[k
], label
= labels
[j
][k
];
2057 (leftovers
[k
] = ~match
& label
) ? (leftoversf
= 1) : 0;
2058 (matches
[k
] = match
& ~label
) ? (matchesf
= 1) : 0;
2061 /* If there were leftovers, create a new group labeled with them. */
2064 copyset(leftovers
, labels
[ngrps
]);
2065 copyset(intersect
, labels
[j
]);
2066 MALLOC(grps
[ngrps
].elems
, position
, d
->nleaves
);
2067 copy(&grps
[j
], &grps
[ngrps
]);
2071 /* Put the position in the current group. Note that there is no
2072 reason to call insert() here. */
2073 grps
[j
].elems
[grps
[j
].nelem
++] = pos
;
2075 /* If every character matching the current position has been
2076 accounted for, we're done. */
2081 /* If we've passed the last group, and there are still characters
2082 unaccounted for, then we'll have to create a new group. */
2085 copyset(matches
, labels
[ngrps
]);
2087 MALLOC(grps
[ngrps
].elems
, position
, d
->nleaves
);
2088 grps
[ngrps
].nelem
= 1;
2089 grps
[ngrps
].elems
[0] = pos
;
2094 MALLOC(follows
.elems
, position
, d
->nleaves
);
2095 MALLOC(tmp
.elems
, position
, d
->nleaves
);
2097 /* If we are a searching matcher, the default transition is to a state
2098 containing the positions of state 0, otherwise the default transition
2099 is to fail miserably. */
2104 for (i
= 0; i
< d
->states
[0].elems
.nelem
; ++i
)
2106 if (PREV_NEWLINE_DEPENDENT(d
->states
[0].elems
.elems
[i
].constraint
))
2108 if (PREV_LETTER_DEPENDENT(d
->states
[0].elems
.elems
[i
].constraint
))
2111 copy(&d
->states
[0].elems
, &follows
);
2112 state
= state_index(d
, &follows
, 0, 0);
2114 state_newline
= state_index(d
, &follows
, 1, 0);
2116 state_newline
= state
;
2118 state_letter
= state_index(d
, &follows
, 0, 1);
2120 state_letter
= state
;
2121 for (i
= 0; i
< NOTCHAR
; ++i
)
2122 trans
[i
] = (IS_WORD_CONSTITUENT(i
)) ? state_letter
: state
;
2123 trans
[eolbyte
] = state_newline
;
2126 for (i
= 0; i
< NOTCHAR
; ++i
)
2129 for (i
= 0; i
< ngrps
; ++i
)
2133 /* Find the union of the follows of the positions of the group.
2134 This is a hideously inefficient loop. Fix it someday. */
2135 for (j
= 0; j
< grps
[i
].nelem
; ++j
)
2136 for (k
= 0; k
< d
->follows
[grps
[i
].elems
[j
].index
].nelem
; ++k
)
2137 insert(d
->follows
[grps
[i
].elems
[j
].index
].elems
[k
], &follows
);
2140 if (d
->mb_cur_max
> 1)
2142 /* If a token in follows.elems is not 1st byte of a multibyte
2143 character, or the states of follows must accept the bytes
2144 which are not 1st byte of the multibyte character.
2145 Then, if a state of follows encounter a byte, it must not be
2146 a 1st byte of a multibyte character nor single byte character.
2147 We cansel to add state[0].follows to next state, because
2148 state[0] must accept 1st-byte
2150 For example, we assume <sb a> is a certain single byte
2151 character, <mb A> is a certain multibyte character, and the
2152 codepoint of <sb a> equals the 2nd byte of the codepoint of
2154 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2155 by accepting accepts 1st byte of <mb A>, and state[i+1]
2156 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2157 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2158 <mb A>, so we can not add state[0]. */
2160 next_isnt_1st_byte
= 0;
2161 for (j
= 0; j
< follows
.nelem
; ++j
)
2163 if (!(d
->multibyte_prop
[follows
.elems
[j
].index
] & 1))
2165 next_isnt_1st_byte
= 1;
2172 /* If we are building a searching matcher, throw in the positions
2173 of state 0 as well. */
2175 if (d
->searchflag
&& (d
->mb_cur_max
== 1 || !next_isnt_1st_byte
))
2179 for (j
= 0; j
< d
->states
[0].elems
.nelem
; ++j
)
2180 insert(d
->states
[0].elems
.elems
[j
], &follows
);
2182 /* Find out if the new state will want any context information. */
2184 if (tstbit(eolbyte
, labels
[i
]))
2185 for (j
= 0; j
< follows
.nelem
; ++j
)
2186 if (PREV_NEWLINE_DEPENDENT(follows
.elems
[j
].constraint
))
2190 for (j
= 0; j
< CHARCLASS_INTS
; ++j
)
2191 if (labels
[i
][j
] & letters
[j
])
2193 if (j
< CHARCLASS_INTS
)
2194 for (j
= 0; j
< follows
.nelem
; ++j
)
2195 if (PREV_LETTER_DEPENDENT(follows
.elems
[j
].constraint
))
2198 /* Find the state(s) corresponding to the union of the follows. */
2199 state
= state_index(d
, &follows
, 0, 0);
2201 state_newline
= state_index(d
, &follows
, 1, 0);
2203 state_newline
= state
;
2205 state_letter
= state_index(d
, &follows
, 0, 1);
2207 state_letter
= state
;
2209 /* Set the transitions for each character in the current label. */
2210 for (j
= 0; j
< CHARCLASS_INTS
; ++j
)
2211 for (k
= 0; k
< INTBITS
; ++k
)
2212 if (labels
[i
][j
] & 1 << k
)
2214 int c
= j
* INTBITS
+ k
;
2217 trans
[c
] = state_newline
;
2218 else if (IS_WORD_CONSTITUENT(c
))
2219 trans
[c
] = state_letter
;
2220 else if (c
< NOTCHAR
)
2225 for (i
= 0; i
< ngrps
; ++i
)
2226 free(grps
[i
].elems
);
2227 free(follows
.elems
);
2231 /* Some routines for manipulating a compiled dfa's transition tables.
2232 Each state may or may not have a transition table; if it does, and it
2233 is a non-accepting state, then d->trans[state] points to its table.
2234 If it is an accepting state then d->fails[state] points to its table.
2235 If it has no table at all, then d->trans[state] is NULL.
2236 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2239 build_state (int s
, struct dfa
*d
)
2241 int *trans
; /* The new transition table. */
2244 /* Set an upper limit on the number of transition tables that will ever
2245 exist at once. 1024 is arbitrary. The idea is that the frequently
2246 used transition tables will be quickly rebuilt, whereas the ones that
2247 were only needed once or twice will be cleared away. */
2248 if (d
->trcount
>= 1024)
2250 for (i
= 0; i
< d
->tralloc
; ++i
)
2254 d
->trans
[i
] = d
->fails
[i
] = NULL
;
2261 /* Set up the success bits for this state. */
2263 if (ACCEPTS_IN_CONTEXT(d
->states
[s
].newline
, 1, d
->states
[s
].letter
, 0,
2266 if (ACCEPTS_IN_CONTEXT(d
->states
[s
].newline
, 0, d
->states
[s
].letter
, 1,
2269 if (ACCEPTS_IN_CONTEXT(d
->states
[s
].newline
, 0, d
->states
[s
].letter
, 0,
2273 MALLOC(trans
, int, NOTCHAR
);
2274 dfastate(s
, d
, trans
);
2276 /* Now go through the new transition table, and make sure that the trans
2277 and fail arrays are allocated large enough to hold a pointer for the
2278 largest state mentioned in the table. */
2279 for (i
= 0; i
< NOTCHAR
; ++i
)
2280 if (trans
[i
] >= d
->tralloc
)
2282 int oldalloc
= d
->tralloc
;
2284 while (trans
[i
] >= d
->tralloc
)
2286 REALLOC(d
->realtrans
, int *, d
->tralloc
+ 1);
2287 d
->trans
= d
->realtrans
+ 1;
2288 REALLOC(d
->fails
, int *, d
->tralloc
);
2289 REALLOC(d
->success
, int, d
->tralloc
);
2290 REALLOC(d
->newlines
, int, d
->tralloc
);
2291 while (oldalloc
< d
->tralloc
)
2293 d
->trans
[oldalloc
] = NULL
;
2294 d
->fails
[oldalloc
++] = NULL
;
2298 /* Keep the newline transition in a special place so we can use it as
2300 d
->newlines
[s
] = trans
[eolbyte
];
2301 trans
[eolbyte
] = -1;
2303 if (ACCEPTING(s
, *d
))
2304 d
->fails
[s
] = trans
;
2306 d
->trans
[s
] = trans
;
2310 build_state_zero (struct dfa
*d
)
2314 CALLOC(d
->realtrans
, int *, d
->tralloc
+ 1);
2315 d
->trans
= d
->realtrans
+ 1;
2316 CALLOC(d
->fails
, int *, d
->tralloc
);
2317 MALLOC(d
->success
, int, d
->tralloc
);
2318 MALLOC(d
->newlines
, int, d
->tralloc
);
2323 /* Multibyte character handling sub-routines for dfaexec. */
2325 /* Initial state may encounter the byte which is not a single byte character
2326 nor 1st byte of a multibyte character. But it is incorrect for initial
2327 state to accept such a byte.
2328 For example, in sjis encoding the regular expression like "\\" accepts
2329 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2330 0x815c. Then Initial state must skip the bytes which are not a single byte
2331 character nor 1st byte of a multibyte character. */
2332 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2335 while (inputwcs[p - buf_begin] == 0 \
2336 && mblen_buf[p - buf_begin] > 0 \
2337 && (unsigned char const *) p < buf_end) \
2339 if ((char *) p >= end) \
2349 realloc_trans_if_necessary(struct dfa
*d
, int new_state
)
2351 /* Make sure that the trans and fail arrays are allocated large enough
2352 to hold a pointer for the new state. */
2353 if (new_state
>= d
->tralloc
)
2355 int oldalloc
= d
->tralloc
;
2357 while (new_state
>= d
->tralloc
)
2359 REALLOC(d
->realtrans
, int *, d
->tralloc
+ 1);
2360 d
->trans
= d
->realtrans
+ 1;
2361 REALLOC(d
->fails
, int *, d
->tralloc
);
2362 REALLOC(d
->success
, int, d
->tralloc
);
2363 REALLOC(d
->newlines
, int, d
->tralloc
);
2364 while (oldalloc
< d
->tralloc
)
2366 d
->trans
[oldalloc
] = NULL
;
2367 d
->fails
[oldalloc
++] = NULL
;
2372 /* Return values of transit_state_singlebyte(), and
2373 transit_state_consume_1char. */
2376 TRANSIT_STATE_IN_PROGRESS
, /* State transition has not finished. */
2377 TRANSIT_STATE_DONE
, /* State transition has finished. */
2378 TRANSIT_STATE_END_BUFFER
/* Reach the end of the buffer. */
2379 } status_transit_state
;
2381 /* Consume a single byte and transit state from 's' to '*next_state'.
2382 This function is almost same as the state transition routin in dfaexec().
2383 But state transition is done just once, otherwise matching succeed or
2384 reach the end of the buffer. */
2385 static status_transit_state
2386 transit_state_singlebyte (struct dfa
*d
, int s
, unsigned char const *p
,
2392 status_transit_state rval
= TRANSIT_STATE_IN_PROGRESS
;
2394 while (rval
== TRANSIT_STATE_IN_PROGRESS
)
2396 if ((t
= d
->trans
[works
]) != NULL
)
2399 rval
= TRANSIT_STATE_DONE
;
2407 /* At the moment, it must not happen. */
2412 else if (d
->fails
[works
])
2414 works
= d
->fails
[works
][*p
];
2415 rval
= TRANSIT_STATE_DONE
;
2419 build_state(works
, d
);
2422 *next_state
= works
;
2426 /* Check whether period can match or not in the current context. If it can,
2427 return the amount of the bytes with which period can match, otherwise
2429 `pos' is the position of the period. `idx' is the index from the
2430 buf_begin, and it is the current position in the buffer. */
2432 match_anychar (struct dfa
*d
, int s
, position pos
, int idx
)
2440 mbclen
= (mblen_buf
[idx
] == 0)? 1 : mblen_buf
[idx
];
2442 /* Check context. */
2443 if (wc
== (wchar_t)eolbyte
)
2445 if (!(syntax_bits
& RE_DOT_NEWLINE
))
2449 else if (wc
== (wchar_t)'\0')
2451 if (syntax_bits
& RE_DOT_NOT_NULL
)
2456 if (iswalnum(wc
) || wc
== L
'_')
2459 if (!SUCCEEDS_IN_CONTEXT(pos
.constraint
, d
->states
[s
].newline
,
2460 newline
, d
->states
[s
].letter
, letter
))
2466 /* Check whether bracket expression can match or not in the current context.
2467 If it can, return the amount of the bytes with which expression can match,
2469 `pos' is the position of the bracket expression. `idx' is the index
2470 from the buf_begin, and it is the current position in the buffer. */
2472 match_mb_charset (struct dfa
*d
, int s
, position pos
, int idx
)
2475 int match
; /* Flag which represent that matching succeed. */
2476 int match_len
; /* Length of the character (or collating element)
2477 with which this operator match. */
2478 int op_len
; /* Length of the operator. */
2482 /* Pointer to the structure to which we are currently refering. */
2483 struct mb_char_classes
*work_mbc
;
2487 wchar_t wc
; /* Current refering character. */
2491 /* Check context. */
2492 if (wc
== (wchar_t)eolbyte
)
2494 if (!(syntax_bits
& RE_DOT_NEWLINE
))
2498 else if (wc
== (wchar_t)'\0')
2500 if (syntax_bits
& RE_DOT_NOT_NULL
)
2504 if (iswalnum(wc
) || wc
== L
'_')
2506 if (!SUCCEEDS_IN_CONTEXT(pos
.constraint
, d
->states
[s
].newline
,
2507 newline
, d
->states
[s
].letter
, letter
))
2510 /* Assign the current refering operator to work_mbc. */
2511 work_mbc
= &(d
->mbcsets
[(d
->multibyte_prop
[pos
.index
]) >> 2]);
2512 match
= !work_mbc
->invert
;
2513 match_len
= (mblen_buf
[idx
] == 0)? 1 : mblen_buf
[idx
];
2515 /* Match in range 0-255? */
2516 if (wc
< NOTCHAR
&& work_mbc
->cset
!= -1
2517 && tstbit((unsigned char)wc
, d
->charclasses
[work_mbc
->cset
]))
2518 goto charset_matched
;
2520 /* match with a character class? */
2521 for (i
= 0; i
<work_mbc
->nch_classes
; i
++)
2523 if (iswctype((wint_t)wc
, work_mbc
->ch_classes
[i
]))
2524 goto charset_matched
;
2527 strncpy(buffer
, (char const *) buf_begin
+ idx
, match_len
);
2528 buffer
[match_len
] = '\0';
2530 /* match with an equivalent class? */
2531 for (i
= 0; i
<work_mbc
->nequivs
; i
++)
2533 op_len
= strlen(work_mbc
->equivs
[i
]);
2534 strncpy(buffer
, (char const *) buf_begin
+ idx
, op_len
);
2535 buffer
[op_len
] = '\0';
2536 if (strcoll(work_mbc
->equivs
[i
], buffer
) == 0)
2539 goto charset_matched
;
2543 /* match with a collating element? */
2544 for (i
= 0; i
<work_mbc
->ncoll_elems
; i
++)
2546 op_len
= strlen(work_mbc
->coll_elems
[i
]);
2547 strncpy(buffer
, (char const *) buf_begin
+ idx
, op_len
);
2548 buffer
[op_len
] = '\0';
2550 if (strcoll(work_mbc
->coll_elems
[i
], buffer
) == 0)
2553 goto charset_matched
;
2558 wcbuf
[1] = wcbuf
[3] = wcbuf
[5] = '\0';
2560 /* match with a range? */
2561 for (i
= 0; i
<work_mbc
->nranges
; i
++)
2563 wcbuf
[2] = work_mbc
->range_sts
[i
];
2564 wcbuf
[4] = work_mbc
->range_ends
[i
];
2566 if (wcscoll(wcbuf
, wcbuf
+2) >= 0 &&
2567 wcscoll(wcbuf
+4, wcbuf
) >= 0)
2568 goto charset_matched
;
2571 /* match with a character? */
2572 for (i
= 0; i
<work_mbc
->nchars
; i
++)
2574 if (wc
== work_mbc
->chars
[i
])
2575 goto charset_matched
;
2581 return match
? match_len
: 0;
2584 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2585 array which corresponds to `d->states[s].mbps.elem' and each element of
2586 the array contains the amount of the bytes with which the element can
2588 `idx' is the index from the buf_begin, and it is the current position
2590 Caller MUST free the array which this function return. */
2592 check_matching_with_multibyte_ops (struct dfa
*d
, int s
, int idx
)
2597 MALLOC(rarray
, int, d
->states
[s
].mbps
.nelem
);
2598 for (i
= 0; i
< d
->states
[s
].mbps
.nelem
; ++i
)
2600 position pos
= d
->states
[s
].mbps
.elems
[i
];
2601 switch(d
->tokens
[pos
.index
])
2604 rarray
[i
] = match_anychar(d
, s
, pos
, idx
);
2607 rarray
[i
] = match_mb_charset(d
, s
, pos
, idx
);
2610 break; /* can not happen. */
2616 /* Consume a single character and enumerate all of the positions which can
2617 be next position from the state `s'.
2618 `match_lens' is the input. It can be NULL, but it can also be the output
2619 of check_matching_with_multibyte_ops() for optimization.
2620 `mbclen' and `pps' are the output. `mbclen' is the length of the
2621 character consumed, and `pps' is the set this function enumerate. */
2622 static status_transit_state
2623 transit_state_consume_1char (struct dfa
*d
, int s
, unsigned char const **pp
,
2624 int *match_lens
, int *mbclen
, position_set
*pps
)
2629 status_transit_state rs
= TRANSIT_STATE_DONE
;
2631 /* Calculate the length of the (single/multi byte) character
2632 to which p points. */
2633 *mbclen
= (mblen_buf
[*pp
- buf_begin
] == 0)? 1
2634 : mblen_buf
[*pp
- buf_begin
];
2636 /* Calculate the state which can be reached from the state `s' by
2637 consuming `*mbclen' single bytes from the buffer. */
2639 for (i
= 0; i
< *mbclen
; i
++)
2642 rs
= transit_state_singlebyte(d
, s2
, (*pp
)++, &s1
);
2644 /* Copy the positions contained by `s1' to the set `pps'. */
2645 copy(&(d
->states
[s1
].elems
), pps
);
2647 /* Check (inputed)match_lens, and initialize if it is NULL. */
2648 if (match_lens
== NULL
&& d
->states
[s
].mbps
.nelem
!= 0)
2649 work_mbls
= check_matching_with_multibyte_ops(d
, s
, *pp
- buf_begin
);
2651 work_mbls
= match_lens
;
2653 /* Add all of the positions which can be reached from `s' by consuming
2654 a single character. */
2655 for (i
= 0; i
< d
->states
[s
].mbps
.nelem
; i
++)
2657 if (work_mbls
[i
] == *mbclen
)
2658 for (j
= 0; j
< d
->follows
[d
->states
[s
].mbps
.elems
[i
].index
].nelem
;
2660 insert(d
->follows
[d
->states
[s
].mbps
.elems
[i
].index
].elems
[j
],
2664 if (match_lens
== NULL
&& work_mbls
!= NULL
)
2669 /* Transit state from s, then return new state and update the pointer of the
2670 buffer. This function is for some operator which can match with a multi-
2671 byte character or a collating element (which may be multi characters). */
2673 transit_state (struct dfa
*d
, int s
, unsigned char const **pp
)
2676 int mbclen
; /* The length of current input multibyte character. */
2679 int *match_lens
= NULL
;
2680 int nelem
= d
->states
[s
].mbps
.nelem
; /* Just a alias. */
2681 position_set follows
;
2682 unsigned char const *p1
= *pp
;
2686 /* This state has (a) multibyte operator(s).
2687 We check whether each of them can match or not. */
2689 /* Note: caller must free the return value of this function. */
2690 match_lens
= check_matching_with_multibyte_ops(d
, s
, *pp
- buf_begin
);
2692 for (i
= 0; i
< nelem
; i
++)
2693 /* Search the operator which match the longest string,
2696 if (match_lens
[i
] > maxlen
)
2697 maxlen
= match_lens
[i
];
2701 if (nelem
== 0 || maxlen
== 0)
2702 /* This state has no multibyte operator which can match.
2703 We need to check only one single byte character. */
2705 status_transit_state rs
;
2706 rs
= transit_state_singlebyte(d
, s
, *pp
, &s1
);
2708 /* We must update the pointer if state transition succeeded. */
2709 if (rs
== TRANSIT_STATE_DONE
)
2716 /* This state has some operators which can match a multibyte character. */
2718 MALLOC(follows
.elems
, position
, d
->nleaves
);
2720 /* `maxlen' may be longer than the length of a character, because it may
2721 not be a character but a (multi character) collating element.
2722 We enumerate all of the positions which `s' can reach by consuming
2724 transit_state_consume_1char(d
, s
, pp
, match_lens
, &mbclen
, &follows
);
2726 wc
= inputwcs
[*pp
- mbclen
- buf_begin
];
2727 s1
= state_index(d
, &follows
, wc
== L
'\n', iswalnum(wc
));
2728 realloc_trans_if_necessary(d
, s1
);
2730 while (*pp
- p1
< maxlen
)
2733 transit_state_consume_1char(d
, s1
, pp
, NULL
, &mbclen
, &follows
);
2735 for (i
= 0; i
< nelem
; i
++)
2737 if (match_lens
[i
] == *pp
- p1
)
2739 j
< d
->follows
[d
->states
[s1
].mbps
.elems
[i
].index
].nelem
; j
++)
2740 insert(d
->follows
[d
->states
[s1
].mbps
.elems
[i
].index
].elems
[j
],
2744 wc
= inputwcs
[*pp
- mbclen
- buf_begin
];
2745 s1
= state_index(d
, &follows
, wc
== L
'\n', iswalnum(wc
));
2746 realloc_trans_if_necessary(d
, s1
);
2749 free(follows
.elems
);
2753 #endif /* MBS_SUPPORT */
2755 /* Search through a buffer looking for a match to the given struct dfa.
2756 Find the first occurrence of a string matching the regexp in the
2757 buffer, and the shortest possible version thereof. Return a pointer to
2758 the first character after the match, or NULL if none is found. BEGIN
2759 points to the beginning of the buffer, and END points to the first byte
2760 after its end. Note however that we store a sentinel byte (usually
2761 newline) in *END, so the actual buffer must be one byte longer.
2762 When NEWLINE is nonzero, newlines may appear in the matching string.
2763 If COUNT is non-NULL, increment *COUNT once for each newline processed.
2764 Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
2765 encountered a back-reference (1) or not (0). The caller may use this
2766 to decide whether to fall back on a backtracking matcher. */
2768 dfaexec (struct dfa
*d
, char const *begin
, char *end
,
2769 int newline
, int *count
, int *backref
)
2771 int s
, s1
, tmp
; /* Current state. */
2772 unsigned char const *p
; /* Current input character. */
2773 int **trans
, *t
; /* Copy of d->trans so it can be optimized
2775 unsigned char eol
= eolbyte
; /* Likewise for eolbyte. */
2776 static int sbit
[NOTCHAR
]; /* Table for anding with d->success. */
2777 static int sbit_init
;
2784 for (i
= 0; i
< NOTCHAR
; ++i
)
2785 sbit
[i
] = (IS_WORD_CONSTITUENT(i
)) ? 2 : 1;
2790 build_state_zero(d
);
2793 p
= (unsigned char const *) begin
;
2795 unsigned char saved_end
= *(unsigned char *) end
;
2799 if (d
->mb_cur_max
> 1)
2801 int remain_bytes
, i
;
2802 buf_begin
= (unsigned char *) begin
;
2803 buf_end
= (unsigned char *) end
;
2805 /* initialize mblen_buf, and inputwcs. */
2806 MALLOC(mblen_buf
, unsigned char, end
- begin
+ 2);
2807 MALLOC(inputwcs
, wchar_t, end
- begin
+ 2);
2808 memset(&mbs
, 0, sizeof(mbstate_t));
2810 for (i
= 0; i
< end
- begin
+ 1; i
++)
2812 if (remain_bytes
== 0)
2815 = mbrtowc(inputwcs
+ i
, begin
+ i
, end
- begin
- i
+ 1, &mbs
);
2816 if (remain_bytes
< 1
2817 || (remain_bytes
== 1 && inputwcs
[i
] == (wchar_t)begin
[i
]))
2820 inputwcs
[i
] = (wchar_t)begin
[i
];
2825 mblen_buf
[i
] = remain_bytes
;
2831 mblen_buf
[i
] = remain_bytes
;
2837 inputwcs
[i
] = 0; /* sentinel */
2839 #endif /* MBS_SUPPORT */
2844 if (d
->mb_cur_max
> 1)
2845 while ((t
= trans
[s
]))
2847 if ((char *) p
> end
)
2850 SKIP_REMAINS_MB_IF_INITIAL_STATE(s
, p
);
2852 if (d
->states
[s
].mbps
.nelem
== 0)
2858 /* Can match with a multibyte character (and multi character
2859 collating element). Transition table might be updated. */
2860 s
= transit_state(d
, s
, &p
);
2864 #endif /* MBS_SUPPORT */
2865 while ((t
= trans
[s
]) != 0) { /* hand-optimized loop */
2867 if ((t
= trans
[s1
]) == 0) {
2868 tmp
= s
; s
= s1
; s1
= tmp
; /* swap */
2874 if (s
>= 0 && (char *) p
<= end
&& d
->fails
[s
])
2876 if (d
->success
[s
] & sbit
[*p
])
2879 *backref
= (d
->states
[s
].backref
!= 0);
2881 if (d
->mb_cur_max
> 1)
2886 #endif /* MBS_SUPPORT */
2893 if (d
->mb_cur_max
> 1)
2895 /* Can match with a multibyte character (and multicharacter
2896 collating element). Transition table might be updated. */
2897 s
= transit_state(d
, s
, &p
);
2901 #endif /* MBS_SUPPORT */
2902 s
= d
->fails
[s
][*p
++];
2906 /* If the previous character was a newline, count it. */
2907 if (count
&& (char *) p
<= end
&& p
[-1] == eol
)
2910 /* Check if we've run off the end of the buffer. */
2911 if ((char *) p
> end
)
2914 if (d
->mb_cur_max
> 1)
2919 #endif /* MBS_SUPPORT */
2931 if (p
[-1] == eol
&& newline
)
2933 s
= d
->newlines
[s1
];
2943 free_mbdata (struct dfa
*d
)
2947 free(d
->multibyte_prop
);
2948 d
->multibyte_prop
= NULL
;
2950 for (i
= 0; i
< d
->nmbcsets
; ++i
)
2953 struct mb_char_classes
*p
= &(d
->mbcsets
[i
]);
2955 free(p
->ch_classes
);
2957 free(p
->range_ends
);
2959 for (j
= 0; j
< p
->nequivs
; ++j
)
2963 for (j
= 0; j
< p
->ncoll_elems
; ++j
)
2964 free(p
->coll_elems
[j
]);
2965 free(p
->coll_elems
);
2974 /* Initialize the components of a dfa that the other routines don't
2975 initialize for themselves. */
2977 dfainit (struct dfa
*d
)
2980 MALLOC(d
->charclasses
, charclass
, d
->calloc
);
2984 MALLOC(d
->tokens
, token
, d
->talloc
);
2985 d
->tindex
= d
->depth
= d
->nleaves
= d
->nregexps
= 0;
2988 d
->mb_cur_max
= MB_CUR_MAX
;
2989 if (d
->mb_cur_max
> 1)
2991 d
->nmultibyte_prop
= 1;
2992 MALLOC(d
->multibyte_prop
, int, d
->nmultibyte_prop
);
2994 d
->mbcsets_alloc
= 1;
2995 MALLOC(d
->mbcsets
, struct mb_char_classes
, d
->mbcsets_alloc
);
3014 dfaoptimize (struct dfa
*d
)
3020 for (i
= 0; i
< d
->tindex
; ++i
)
3022 switch(d
->tokens
[i
])
3026 /* Requires multi-byte algorithm. */
3038 /* Parse and analyze a single string of the given length. */
3040 dfacomp (char const *s
, size_t len
, struct dfa
*d
, int searchflag
)
3043 dfaparse(s
, len
, d
);
3048 dfaanalyze(d
, searchflag
);
3051 /* Free the storage held by the components of a dfa. */
3053 dfafree (struct dfa
*d
)
3056 struct dfamust
*dm
, *ndm
;
3058 free(d
->charclasses
);
3062 if (d
->mb_cur_max
> 1)
3064 #endif /* MBS_SUPPORT */
3066 for (i
= 0; i
< d
->sindex
; ++i
) {
3067 free(d
->states
[i
].elems
.elems
);
3069 free(d
->states
[i
].mbps
.elems
);
3070 #endif /* MBS_SUPPORT */
3073 for (i
= 0; i
< d
->tindex
; ++i
)
3074 free(d
->follows
[i
].elems
);
3076 for (i
= 0; i
< d
->tralloc
; ++i
)
3085 for (dm
= d
->musts
; dm
; dm
= ndm
)
3093 /* Having found the postfix representation of the regular expression,
3094 try to find a long sequence of characters that must appear in any line
3096 Finding a "longest" sequence is beyond the scope here;
3097 we take an easy way out and hope for the best.
3098 (Take "(ab|a)b"--please.)
3100 We do a bottom-up calculation of sequences of characters that must appear
3101 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3103 sequences that must appear at the left of the match ("left")
3104 sequences that must appear at the right of the match ("right")
3105 lists of sequences that must appear somewhere in the match ("in")
3106 sequences that must constitute the match ("is")
3108 When we get to the root of the tree, we use one of the longest of its
3109 calculated "in" sequences as our answer. The sequence we find is returned in
3110 d->must (where "d" is the single argument passed to "dfamust");
3111 the length of the sequence is returned in d->mustn.
3113 The sequences calculated for the various types of node (in pseudo ANSI c)
3114 are shown below. "p" is the operand of unary operators (and the left-hand
3115 operand of binary operators); "q" is the right-hand operand of binary
3118 "ZERO" means "a zero-length sequence" below.
3120 Type left right is in
3121 ---- ---- ----- -- --
3122 char c # c # c # c # c
3124 ANYCHAR ZERO ZERO ZERO ZERO
3126 MBCSET ZERO ZERO ZERO ZERO
3128 CSET ZERO ZERO ZERO ZERO
3130 STAR ZERO ZERO ZERO ZERO
3132 QMARK ZERO ZERO ZERO ZERO
3134 PLUS p->left p->right ZERO p->in
3136 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3137 p->left : q->right : q->is!=ZERO) ? q->in plus
3138 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3141 OR longest common longest common (do p->is and substrings common to
3142 leading trailing q->is have same p->in and q->in
3143 (sub)sequence (sub)sequence length and
3144 of p->left of p->right content) ?
3145 and q->left and q->right p->is : NULL
3147 If there's anything else we recognize in the tree, all four sequences get set
3148 to zero-length sequences. If there's something we don't recognize in the tree,
3149 we just return a zero-length sequence.
3151 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3154 And. . .is it here or someplace that we might ponder "optimizations" such as
3155 egrep 'psi|epsilon' -> egrep 'psi'
3156 egrep 'pepsi|epsilon' -> egrep 'epsi'
3157 (Yes, we now find "epsi" as a "string
3158 that must occur", but we might also
3159 simplify the *entire* r.e. being sought)
3160 grep '[c]' -> grep 'c'
3161 grep '(ab|a)b' -> grep 'ab'
3162 grep 'ab*' -> grep 'a'
3163 grep 'a*b' -> grep 'b'
3165 There are several issues:
3167 Is optimization easy (enough)?
3169 Does optimization actually accomplish anything,
3170 or is the automaton you get from "psi|epsilon" (for example)
3171 the same as the one you get from "psi" (for example)?
3173 Are optimizable r.e.'s likely to be used in real-life situations
3174 (something like 'ab*' is probably unlikely; something like is
3175 'psi|epsilon' is likelier)? */
3178 icatalloc (char *old
, char *new)
3181 size_t oldsize
, newsize
;
3183 newsize
= (new == NULL
) ? 0 : strlen(new);
3186 else if (newsize
== 0)
3188 else oldsize
= strlen(old
);
3190 result
= (char *) malloc(newsize
+ 1);
3192 result
= (char *) realloc((void *) old
, oldsize
+ newsize
+ 1);
3193 if (result
!= NULL
&& new != NULL
)
3194 (void) strcpy(result
+ oldsize
, new);
3199 icpyalloc (char *string
)
3201 return icatalloc((char *) NULL
, string
);
3205 istrstr (char *lookin
, char *lookfor
)
3210 len
= strlen(lookfor
);
3211 for (cp
= lookin
; *cp
!= '\0'; ++cp
)
3212 if (strncmp(cp
, lookfor
, len
) == 0)
3218 freelist (char **cpp
)
3224 for (i
= 0; cpp
[i
] != NULL
; ++i
)
3232 enlist (char **cpp
, char *new, size_t len
)
3238 if ((new = icpyalloc(new)) == NULL
)
3244 /* Is there already something in the list that's new (or longer)? */
3245 for (i
= 0; cpp
[i
] != NULL
; ++i
)
3246 if (istrstr(cpp
[i
], new) != NULL
)
3251 /* Eliminate any obsoleted strings. */
3253 while (cpp
[j
] != NULL
)
3254 if (istrstr(new, cpp
[j
]) == NULL
)
3264 /* Add the new string. */
3265 cpp
= (char **) realloc((char *) cpp
, (i
+ 2) * sizeof *cpp
);
3273 /* Given pointers to two strings, return a pointer to an allocated
3274 list of their distinct common substrings. Return NULL if something
3277 comsubs (char *left
, char *right
)
3284 if (left
== NULL
|| right
== NULL
)
3286 cpp
= (char **) malloc(sizeof *cpp
);
3290 for (lcp
= left
; *lcp
!= '\0'; ++lcp
)
3293 rcp
= strchr (right
, *lcp
);
3296 for (i
= 1; lcp
[i
] != '\0' && lcp
[i
] == rcp
[i
]; ++i
)
3300 rcp
= strchr (rcp
+ 1, *lcp
);
3304 if ((cpp
= enlist(cpp
, lcp
, len
)) == NULL
)
3311 addlists (char **old
, char **new)
3315 if (old
== NULL
|| new == NULL
)
3317 for (i
= 0; new[i
] != NULL
; ++i
)
3319 old
= enlist(old
, new[i
], strlen(new[i
]));
3326 /* Given two lists of substrings, return a new list giving substrings
3329 inboth (char **left
, char **right
)
3335 if (left
== NULL
|| right
== NULL
)
3337 both
= (char **) malloc(sizeof *both
);
3341 for (lnum
= 0; left
[lnum
] != NULL
; ++lnum
)
3343 for (rnum
= 0; right
[rnum
] != NULL
; ++rnum
)
3345 temp
= comsubs(left
[lnum
], right
[rnum
]);
3351 both
= addlists(both
, temp
);
3370 resetmust (must
*mp
)
3372 mp
->left
[0] = mp
->right
[0] = mp
->is
[0] = '\0';
3377 dfamust (struct dfa
*d
)
3388 static char empty_string
[] = "";
3390 result
= empty_string
;
3392 musts
= (must
*) malloc((d
->tindex
+ 1) * sizeof *musts
);
3396 for (i
= 0; i
<= d
->tindex
; ++i
)
3398 for (i
= 0; i
<= d
->tindex
; ++i
)
3400 mp
[i
].in
= (char **) malloc(sizeof *mp
[i
].in
);
3401 mp
[i
].left
= malloc(2);
3402 mp
[i
].right
= malloc(2);
3403 mp
[i
].is
= malloc(2);
3404 if (mp
[i
].in
== NULL
|| mp
[i
].left
== NULL
||
3405 mp
[i
].right
== NULL
|| mp
[i
].is
== NULL
)
3407 mp
[i
].left
[0] = mp
[i
].right
[0] = mp
[i
].is
[0] = '\0';
3411 fprintf(stderr
, "dfamust:\n");
3412 for (i
= 0; i
< d
->tindex
; ++i
)
3414 fprintf(stderr
, " %d:", i
);
3415 prtok(d
->tokens
[i
]);
3419 for (ri
= 0; ri
< d
->tindex
; ++ri
)
3421 switch (t
= d
->tokens
[ri
])
3425 goto done
; /* "cannot happen" */
3439 goto done
; /* "cannot happen" */
3446 goto done
; /* "cannot happen" */
3455 /* Guaranteed to be. Unlikely, but. . . */
3456 if (strcmp(lmp
->is
, rmp
->is
) != 0)
3458 /* Left side--easy */
3460 while (lmp
->left
[i
] != '\0' && lmp
->left
[i
] == rmp
->left
[i
])
3462 lmp
->left
[i
] = '\0';
3464 ln
= strlen(lmp
->right
);
3465 rn
= strlen(rmp
->right
);
3469 for (i
= 0; i
< n
; ++i
)
3470 if (lmp
->right
[ln
- i
- 1] != rmp
->right
[rn
- i
- 1])
3472 for (j
= 0; j
< i
; ++j
)
3473 lmp
->right
[j
] = lmp
->right
[(ln
- i
) + j
];
3474 lmp
->right
[j
] = '\0';
3475 new = inboth(lmp
->in
, rmp
->in
);
3485 goto done
; /* "cannot happen" */
3490 if (mp
!= &musts
[1])
3491 goto done
; /* "cannot happen" */
3492 for (i
= 0; musts
[0].in
[i
] != NULL
; ++i
)
3493 if (strlen(musts
[0].in
[i
]) > strlen(result
))
3494 result
= musts
[0].in
[i
];
3495 if (strcmp(result
, musts
[0].is
) == 0)
3500 goto done
; /* "cannot happen" */
3507 /* In. Everything in left, plus everything in
3508 right, plus catenation of
3509 left's right and right's left. */
3510 lmp
->in
= addlists(lmp
->in
, rmp
->in
);
3511 if (lmp
->in
== NULL
)
3513 if (lmp
->right
[0] != '\0' &&
3514 rmp
->left
[0] != '\0')
3518 tp
= icpyalloc(lmp
->right
);
3521 tp
= icatalloc(tp
, rmp
->left
);
3524 lmp
->in
= enlist(lmp
->in
, tp
,
3527 if (lmp
->in
== NULL
)
3531 if (lmp
->is
[0] != '\0')
3533 lmp
->left
= icatalloc(lmp
->left
,
3535 if (lmp
->left
== NULL
)
3539 if (rmp
->is
[0] == '\0')
3540 lmp
->right
[0] = '\0';
3541 lmp
->right
= icatalloc(lmp
->right
, rmp
->right
);
3542 if (lmp
->right
== NULL
)
3544 /* Guaranteed to be */
3545 if (lmp
->is
[0] != '\0' && rmp
->is
[0] != '\0')
3547 lmp
->is
= icatalloc(lmp
->is
, rmp
->is
);
3548 if (lmp
->is
== NULL
)
3558 /* "cannot happen" */
3563 /* not on *my* shift */
3570 #endif /* MBS_SUPPORT */
3578 /* plain character */
3580 mp
->is
[0] = mp
->left
[0] = mp
->right
[0] = t
;
3581 mp
->is
[1] = mp
->left
[1] = mp
->right
[1] = '\0';
3582 mp
->in
= enlist(mp
->in
, mp
->is
, (size_t)1);
3589 fprintf(stderr
, " node: %d:", ri
);
3590 prtok(d
->tokens
[ri
]);
3591 fprintf(stderr
, "\n in:");
3592 for (i
= 0; mp
->in
[i
]; ++i
)
3593 fprintf(stderr
, " \"%s\"", mp
->in
[i
]);
3594 fprintf(stderr
, "\n is: \"%s\"\n", mp
->is
);
3595 fprintf(stderr
, " left: \"%s\"\n", mp
->left
);
3596 fprintf(stderr
, " right: \"%s\"\n", mp
->right
);
3603 MALLOC(dm
, struct dfamust
, 1);
3605 MALLOC(dm
->must
, char, strlen(result
) + 1);
3606 strcpy(dm
->must
, result
);
3607 dm
->next
= d
->musts
;
3611 for (i
= 0; i
<= d
->tindex
; ++i
)
3621 /* vim:set shiftwidth=2: */