build: avoid another warning
[grep/ericb.git] / src / dfa.c
blobc2ef18c72bb2435205b4a7f646f88df477e302d8
1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004, 2005, 2007-2010 Free Software
3 Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
23 #include <config.h>
24 #include <assert.h>
25 #include <ctype.h>
26 #include <stdio.h>
27 #include <sys/types.h>
28 #include <stddef.h>
29 #include <stdlib.h>
30 #include <limits.h>
31 #include <string.h>
32 #include <locale.h>
34 #ifndef isgraph
35 #define isgraph(C) (isprint(C) && !isspace(C))
36 #endif
38 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
39 #define ISALPHA(C) isalpha(C)
40 #define ISUPPER(C) isupper(C)
41 #define ISLOWER(C) islower(C)
42 #define ISDIGIT(C) isdigit(C)
43 #define ISXDIGIT(C) isxdigit(C)
44 #define ISSPACE(C) isspace(C)
45 #define ISPUNCT(C) ispunct(C)
46 #define ISALNUM(C) isalnum(C)
47 #define ISPRINT(C) isprint(C)
48 #define ISGRAPH(C) isgraph(C)
49 #define ISCNTRL(C) iscntrl(C)
50 #else
51 #define ISALPHA(C) (isascii(C) && isalpha(C))
52 #define ISUPPER(C) (isascii(C) && isupper(C))
53 #define ISLOWER(C) (isascii(C) && islower(C))
54 #define ISDIGIT(C) (isascii(C) && isdigit(C))
55 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
56 #define ISSPACE(C) (isascii(C) && isspace(C))
57 #define ISPUNCT(C) (isascii(C) && ispunct(C))
58 #define ISALNUM(C) (isascii(C) && isalnum(C))
59 #define ISPRINT(C) (isascii(C) && isprint(C))
60 #define ISGRAPH(C) (isascii(C) && isgraph(C))
61 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
62 #endif
64 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
65 - Its arg may be any int or unsigned int; it need not be an unsigned char.
66 - It's guaranteed to evaluate its argument exactly once.
67 - It's typically faster.
68 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
69 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
70 it's important to use the locale's definition of `digit' even when the
71 host does not conform to Posix. */
72 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
74 /* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
75 #include "gettext.h"
76 #define _(str) gettext (str)
78 #include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
79 #ifdef MBS_SUPPORT
80 /* We can handle multibyte strings. */
81 # include <wchar.h>
82 # include <wctype.h>
83 # include <langinfo.h>
84 #endif
86 #include "regex.h"
87 #include "dfa.h"
88 #include "hard-locale.h"
89 #include "xalloc.h"
91 /* HPUX, define those as macros in sys/param.h */
92 #ifdef setbit
93 # undef setbit
94 #endif
95 #ifdef clrbit
96 # undef clrbit
97 #endif
99 static void dfamust (struct dfa *dfa);
100 static void regexp (int toplevel);
102 #define CALLOC(p, t, n) ((p) = xcalloc((size_t)(n), sizeof (t)))
103 #define MALLOC(p, t, n) ((p) = xmalloc((n) * sizeof (t)))
104 #define REALLOC(p, t, n) ((p) = xrealloc((p), (n) * sizeof (t)))
106 /* Reallocate an array of type t if nalloc is too small for index. */
107 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
108 if ((index) >= (nalloc)) \
110 do \
111 (nalloc) *= 2; \
112 while ((index) >= (nalloc)); \
113 REALLOC(p, t, nalloc); \
116 #ifdef DEBUG
118 static void
119 prtok (token t)
121 char const *s;
123 if (t < 0)
124 fprintf(stderr, "END");
125 else if (t < NOTCHAR)
126 fprintf(stderr, "%c", t);
127 else
129 switch (t)
131 case EMPTY: s = "EMPTY"; break;
132 case BACKREF: s = "BACKREF"; break;
133 case BEGLINE: s = "BEGLINE"; break;
134 case ENDLINE: s = "ENDLINE"; break;
135 case BEGWORD: s = "BEGWORD"; break;
136 case ENDWORD: s = "ENDWORD"; break;
137 case LIMWORD: s = "LIMWORD"; break;
138 case NOTLIMWORD: s = "NOTLIMWORD"; break;
139 case QMARK: s = "QMARK"; break;
140 case STAR: s = "STAR"; break;
141 case PLUS: s = "PLUS"; break;
142 case CAT: s = "CAT"; break;
143 case OR: s = "OR"; break;
144 case ORTOP: s = "ORTOP"; break;
145 case LPAREN: s = "LPAREN"; break;
146 case RPAREN: s = "RPAREN"; break;
147 #ifdef MBS_SUPPORT
148 case ANYCHAR: s = "ANYCHAR"; break;
149 case MBCSET: s = "MBCSET"; break;
150 #endif /* MBS_SUPPORT */
151 default: s = "CSET"; break;
153 fprintf(stderr, "%s", s);
156 #endif /* DEBUG */
158 /* Stuff pertaining to charclasses. */
160 static int
161 tstbit (unsigned int b, charclass c)
163 return c[b / INTBITS] & 1 << b % INTBITS;
166 static void
167 setbit (unsigned int b, charclass c)
169 c[b / INTBITS] |= 1 << b % INTBITS;
172 static void
173 clrbit (unsigned int b, charclass c)
175 c[b / INTBITS] &= ~(1 << b % INTBITS);
178 static void
179 copyset (charclass src, charclass dst)
181 memcpy (dst, src, sizeof (charclass));
184 static void
185 zeroset (charclass s)
187 memset (s, 0, sizeof (charclass));
190 static void
191 notset (charclass s)
193 int i;
195 for (i = 0; i < CHARCLASS_INTS; ++i)
196 s[i] = ~s[i];
199 static int
200 equal (charclass s1, charclass s2)
202 return memcmp (s1, s2, sizeof (charclass)) == 0;
205 /* A pointer to the current dfa is kept here during parsing. */
206 static struct dfa *dfa;
208 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
209 static int
210 charclass_index (charclass s)
212 int i;
214 for (i = 0; i < dfa->cindex; ++i)
215 if (equal(s, dfa->charclasses[i]))
216 return i;
217 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
218 ++dfa->cindex;
219 copyset(s, dfa->charclasses[i]);
220 return i;
223 /* Syntax bits controlling the behavior of the lexical analyzer. */
224 static reg_syntax_t syntax_bits, syntax_bits_set;
226 /* Flag for case-folding letters into sets. */
227 static int case_fold;
229 /* End-of-line byte in data. */
230 static unsigned char eolbyte;
232 /* Entry point to set syntax options. */
233 void
234 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
236 syntax_bits_set = 1;
237 syntax_bits = bits;
238 case_fold = fold;
239 eolbyte = eol;
242 /* Like setbit, but if case is folded, set both cases of a letter.
243 For MB_CUR_MAX > 1, one or both of the two cases may not be set,
244 so the resulting charset may only be used as an optimization. */
245 static void
246 setbit_case_fold (
247 #ifdef MBS_SUPPORT
248 wint_t b,
249 #else
250 unsigned int b,
251 #endif
252 charclass c)
254 if (case_fold)
256 #ifdef MBS_SUPPORT
257 if (MB_CUR_MAX > 1)
259 wint_t b1 = iswupper(b) ? towlower(b) : b;
260 wint_t b2 = iswlower(b) ? towupper(b) : b;
261 if (wctob ((unsigned char)b1) == b1)
262 setbit (b1, c);
263 if (b2 != b1 && wctob ((unsigned char)b2) == b2)
264 setbit (b2, c);
266 else
267 #endif
269 unsigned char b1 = ISUPPER(b) ? tolower(b) : b;
270 unsigned char b2 = ISLOWER(b) ? toupper(b) : b;
271 setbit (b1, c);
272 if (b2 != b1)
273 setbit (b2, c);
276 else
278 #ifdef MBS_SUPPORT
279 if (wctob ((unsigned char)b) == b)
280 #endif
281 setbit (b, c);
286 /* UTF-8 encoding allows some optimizations that we can't otherwise
287 assume in a multibyte encoding. */
288 static inline int
289 using_utf8 (void)
291 static int utf8 = -1;
292 if (utf8 == -1)
294 #if defined HAVE_LANGINFO_CODESET && defined MBS_SUPPORT
295 utf8 = (strcmp (nl_langinfo (CODESET), "UTF-8") == 0);
296 #else
297 utf8 = 0;
298 #endif
301 return utf8;
304 /* Lexical analyzer. All the dross that deals with the obnoxious
305 GNU Regex syntax bits is located here. The poor, suffering
306 reader is referred to the GNU Regex documentation for the
307 meaning of the @#%!@#%^!@ syntax bits. */
309 static char const *lexptr; /* Pointer to next input character. */
310 static int lexleft; /* Number of characters remaining. */
311 static token lasttok; /* Previous token returned; initially END. */
312 static int laststart; /* True if we're separated from beginning or (, |
313 only by zero-width characters. */
314 static int parens; /* Count of outstanding left parens. */
315 static int minrep, maxrep; /* Repeat counts for {m,n}. */
316 static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
318 static int cur_mb_len = 1; /* Length of the multibyte representation of
319 wctok. */
320 #ifdef MBS_SUPPORT
321 /* These variables are used only if (MB_CUR_MAX > 1). */
322 static mbstate_t mbs; /* Mbstate for mbrlen(). */
323 static wchar_t wctok; /* Wide character representation of the current
324 multibyte character. */
325 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
326 Each element store the amount of remain
327 byte of corresponding multibyte character
328 in the input string. A element's value
329 is 0 if corresponding character is a
330 single byte chracter.
331 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
332 mblen_buf : 0, 3, 2, 1
334 static wchar_t *inputwcs; /* Wide character representation of input
335 string in dfaexec().
336 The length of this array is same as
337 the length of input string(char array).
338 inputstring[i] is a single-byte char,
339 or 1st byte of a multibyte char.
340 And inputwcs[i] is the codepoint. */
341 static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */
342 static unsigned char const *buf_end; /* reference to end in dfaexec(). */
343 #endif /* MBS_SUPPORT */
346 #ifdef MBS_SUPPORT
347 /* Note that characters become unsigned here. */
348 # define FETCH_WC(c, wc, eoferr) \
349 do { \
350 if (! lexleft) \
352 if ((eoferr) != 0) \
353 dfaerror (eoferr); \
354 else \
355 return lasttok = END; \
357 else \
359 wchar_t _wc; \
360 cur_mb_len = mbrtowc(&_wc, lexptr, lexleft, &mbs); \
361 if (cur_mb_len <= 0) \
363 cur_mb_len = 1; \
364 --lexleft; \
365 (wc) = (c) = (unsigned char) *lexptr++; \
367 else \
369 lexptr += cur_mb_len; \
370 lexleft -= cur_mb_len; \
371 (wc) = _wc; \
372 (c) = wctob(wc); \
375 } while(0)
377 # define FETCH(c, eoferr) \
378 do { \
379 wint_t wc; \
380 FETCH_WC(c, wc, eoferr); \
381 } while(0)
383 #else
384 /* Note that characters become unsigned here. */
385 # define FETCH(c, eoferr) \
386 do { \
387 if (! lexleft) \
389 if ((eoferr) != 0) \
390 dfaerror (eoferr); \
391 else \
392 return lasttok = END; \
394 (c) = (unsigned char) *lexptr++; \
395 --lexleft; \
396 } while(0)
398 # define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
400 #endif /* MBS_SUPPORT */
402 static int
403 in_coll_range (char ch, char from, char to)
405 char c[6] = { from, 0, ch, 0, to, 0 };
406 return strcoll (&c[0], &c[2]) <= 0 && strcoll (&c[2], &c[4]) <= 0;
409 static int is_alpha(int c) { return ISALPHA(c); }
410 static int is_upper(int c) { return ISUPPER(c); }
411 static int is_lower(int c) { return ISLOWER(c); }
412 static int is_digit(int c) { return ISDIGIT(c); }
413 static int is_xdigit(int c) { return ISXDIGIT(c); }
414 static int is_space(int c) { return ISSPACE(c); }
415 static int is_punct(int c) { return ISPUNCT(c); }
416 static int is_alnum(int c) { return ISALNUM(c); }
417 static int is_print(int c) { return ISPRINT(c); }
418 static int is_graph(int c) { return ISGRAPH(c); }
419 static int is_cntrl(int c) { return ISCNTRL(c); }
421 static int
422 is_blank (int c)
424 return (c == ' ' || c == '\t');
427 typedef int predicate (int);
429 /* The following list maps the names of the Posix named character classes
430 to predicate functions that determine whether a given character is in
431 the class. The leading [ has already been eaten by the lexical analyzer. */
432 static struct {
433 const char *name;
434 predicate *pred;
435 } const prednames[] = {
436 { "alpha", is_alpha },
437 { "upper", is_upper },
438 { "lower", is_lower },
439 { "digit", is_digit },
440 { "xdigit", is_xdigit },
441 { "space", is_space },
442 { "punct", is_punct },
443 { "alnum", is_alnum },
444 { "print", is_print },
445 { "graph", is_graph },
446 { "cntrl", is_cntrl },
447 { "blank", is_blank },
448 { NULL, NULL }
451 static predicate *
452 find_pred (const char *str)
454 unsigned int i;
455 for (i = 0; prednames[i].name; ++i)
456 if (!strcmp(str, prednames[i].name))
457 break;
459 return prednames[i].pred;
462 /* Multibyte character handling sub-routine for lex.
463 This function parse a bracket expression and build a struct
464 mb_char_classes. */
465 static token
466 parse_bracket_exp (void)
468 int invert;
469 int c, c1, c2;
470 charclass ccl;
472 #ifdef MBS_SUPPORT
473 wint_t wc, wc1, wc2;
475 /* Work area to build a mb_char_classes. */
476 struct mb_char_classes *work_mbc;
477 int chars_al, range_sts_al, range_ends_al, ch_classes_al,
478 equivs_al, coll_elems_al;
480 chars_al = 1;
481 range_sts_al = range_ends_al = 0;
482 ch_classes_al = equivs_al = coll_elems_al = 0;
483 if (MB_CUR_MAX > 1)
485 REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
486 dfa->mbcsets_alloc, dfa->nmbcsets + 1);
488 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
489 We will update dfa->multibyte_prop[] in addtok(), because we can't
490 decide the index in dfa->tokens[]. */
492 /* Initialize work area. */
493 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
494 memset (work_mbc, 0, sizeof *work_mbc);
496 else
497 work_mbc = NULL;
498 #endif
500 memset (ccl, 0, sizeof(ccl));
501 FETCH_WC (c, wc, _("unbalanced ["));
502 if (c == '^')
504 FETCH_WC (c, wc, _("unbalanced ["));
505 invert = 1;
507 else
508 invert = 0;
512 c1 = EOF; /* mark c1 is not initialized". */
514 /* Note that if we're looking at some other [:...:] construct,
515 we just treat it as a bunch of ordinary characters. We can do
516 this because we assume regex has checked for syntax errors before
517 dfa is ever called. */
518 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
520 #define BRACKET_BUFFER_SIZE 128
521 char str[BRACKET_BUFFER_SIZE];
522 FETCH_WC (c1, wc1, _("unbalanced ["));
524 /* If pattern contains `[[:', `[[.', or `[[='. */
525 if (c1 == ':'
526 #ifdef MBS_SUPPORT
527 /* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1. */
528 || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '='))
529 #endif
532 int len = 0;
533 for (;;)
535 FETCH_WC (c, wc, _("unbalanced ["));
536 if ((c == c1 && *lexptr == ']') || lexleft == 0)
537 break;
538 if (len < BRACKET_BUFFER_SIZE)
539 str[len++] = c;
540 else
541 /* This is in any case an invalid class name. */
542 str[0] = '\0';
544 str[len] = '\0';
546 /* Fetch bracket. */
547 FETCH_WC (c, wc, _("unbalanced ["));
548 if (c1 == ':')
549 /* build character class. */
551 char const *class
552 = (case_fold && (!strcmp (str, "upper")
553 || !strcmp (str, "lower"))
554 ? "alpha"
555 : str);
556 #ifdef MBS_SUPPORT
557 if (MB_CUR_MAX > 1)
559 /* Store the character class as wctype_t. */
560 wctype_t wt = wctype (class);
562 if (ch_classes_al == 0)
563 MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
564 REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
565 ch_classes_al,
566 work_mbc->nch_classes + 1);
567 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
569 #endif
572 predicate *pred = find_pred (class);
573 if (!pred)
574 dfaerror(_("invalid character class"));
575 for (c2 = 0; c2 < NOTCHAR; ++c2)
576 if ((*pred)(c2))
577 setbit_case_fold (c2, ccl);
581 #ifdef MBS_SUPPORT
582 else if (c1 == '=' || c1 == '.')
584 char *elem;
585 MALLOC(elem, char, len + 1);
586 strncpy(elem, str, len + 1);
588 if (c1 == '=')
589 /* build equivalent class. */
591 if (equivs_al == 0)
592 MALLOC(work_mbc->equivs, char*, ++equivs_al);
593 REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
594 equivs_al,
595 work_mbc->nequivs + 1);
596 work_mbc->equivs[work_mbc->nequivs++] = elem;
599 if (c1 == '.')
600 /* build collating element. */
602 if (coll_elems_al == 0)
603 MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
604 REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
605 coll_elems_al,
606 work_mbc->ncoll_elems + 1);
607 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
610 #endif
612 /* Fetch new lookahead character. */
613 FETCH_WC (c1, wc1, _("unbalanced ["));
614 continue;
617 /* We treat '[' as a normal character here. c/c1/wc/wc1
618 are already set up. */
621 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
622 FETCH_WC(c, wc, _("unbalanced ["));
624 if (c1 == EOF)
625 FETCH_WC(c1, wc1, _("unbalanced ["));
627 if (c1 == '-')
628 /* build range characters. */
630 FETCH_WC(c2, wc2, _("unbalanced ["));
631 if (c2 == ']')
633 /* In the case [x-], the - is an ordinary hyphen,
634 which is left in c1, the lookahead character. */
635 lexptr -= cur_mb_len;
636 lexleft += cur_mb_len;
640 if (c1 == '-' && c2 != ']')
642 if (c2 == '\\'
643 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
644 FETCH_WC(c2, wc2, _("unbalanced ["));
646 #ifdef MBS_SUPPORT
647 if (MB_CUR_MAX > 1)
649 /* When case folding map a range, say [m-z] (or even [M-z])
650 to the pair of ranges, [m-z] [M-Z]. */
651 if (range_sts_al == 0)
653 MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
654 MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
656 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
657 range_sts_al, work_mbc->nranges + 1);
658 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
659 range_ends_al, work_mbc->nranges + 1);
660 work_mbc->range_sts[work_mbc->nranges] =
661 case_fold ? towlower(wc) : (wchar_t)wc;
662 work_mbc->range_ends[work_mbc->nranges++] =
663 case_fold ? towlower(wc2) : (wchar_t)wc2;
665 #ifndef GREP
666 if (case_fold && (iswalpha(wc) || iswalpha(wc2)))
668 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
669 range_sts_al, work_mbc->nranges + 1);
670 work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
671 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
672 range_ends_al, work_mbc->nranges + 1);
673 work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
675 #endif
677 else
678 #endif
680 c1 = c;
681 if (case_fold)
683 c1 = tolower (c1);
684 c2 = tolower (c2);
686 if (!hard_LC_COLLATE)
687 for (c = c1; c <= c2; c++)
688 setbit_case_fold (c, ccl);
689 else
690 for (c = 0; c < NOTCHAR; ++c)
691 if (!(case_fold && ISUPPER (c))
692 && in_coll_range (c, c1, c2))
693 setbit_case_fold (c, ccl);
696 FETCH_WC(c1, wc1, _("unbalanced ["));
697 continue;
700 #ifdef MBS_SUPPORT
701 /* Build normal characters. */
702 setbit_case_fold (wc, ccl);
703 if (MB_CUR_MAX > 1)
705 if (case_fold && iswalpha(wc))
707 wc = towlower(wc);
708 c = wctob(wc);
709 if (c == EOF || (wint_t)c != (wint_t)wc)
711 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
712 work_mbc->nchars + 1);
713 work_mbc->chars[work_mbc->nchars++] = wc;
715 #ifdef GREP
716 continue;
717 #else
718 wc = towupper(wc);
719 c = wctob(wc);
720 #endif
722 if (c == EOF || (wint_t)c != (wint_t)wc)
724 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
725 work_mbc->nchars + 1);
726 work_mbc->chars[work_mbc->nchars++] = wc;
729 #else
730 setbit_case_fold (c, ccl);
731 #endif
733 while ((
734 #ifdef MBS_SUPPORT
735 wc = wc1,
736 #endif
737 (c = c1) != ']'));
739 #ifdef MBS_SUPPORT
740 if (MB_CUR_MAX > 1
741 && (!using_utf8()
742 || invert
743 || work_mbc->nchars != 0
744 || work_mbc->nch_classes != 0
745 || work_mbc->nranges != 0
746 || work_mbc->nequivs != 0
747 || work_mbc->ncoll_elems != 0))
749 static charclass zeroclass;
750 work_mbc->invert = invert;
751 work_mbc->cset = equal(ccl, zeroclass) ? -1 : charclass_index(ccl);
752 return MBCSET;
754 #endif
756 if (invert)
758 #ifdef MBS_SUPPORT
759 assert(MB_CUR_MAX == 1);
760 #endif
761 notset(ccl);
762 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
763 clrbit(eolbyte, ccl);
766 return CSET + charclass_index(ccl);
769 /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
770 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
772 static token
773 lex (void)
775 unsigned int c, c2;
776 int backslash = 0;
777 charclass ccl;
778 int i;
780 /* Basic plan: We fetch a character. If it's a backslash,
781 we set the backslash flag and go through the loop again.
782 On the plus side, this avoids having a duplicate of the
783 main switch inside the backslash case. On the minus side,
784 it means that just about every case begins with
785 "if (backslash) ...". */
786 for (i = 0; i < 2; ++i)
788 #ifdef MBS_SUPPORT
789 if (MB_CUR_MAX > 1)
791 FETCH_WC (c, wctok, NULL);
792 if ((int)c == EOF)
793 goto normal_char;
795 else
796 #endif /* MBS_SUPPORT */
797 FETCH(c, NULL);
799 switch (c)
801 case '\\':
802 if (backslash)
803 goto normal_char;
804 if (lexleft == 0)
805 dfaerror(_("unfinished \\ escape"));
806 backslash = 1;
807 break;
809 case '^':
810 if (backslash)
811 goto normal_char;
812 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
813 || lasttok == END
814 || lasttok == LPAREN
815 || lasttok == OR)
816 return lasttok = BEGLINE;
817 goto normal_char;
819 case '$':
820 if (backslash)
821 goto normal_char;
822 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
823 || lexleft == 0
824 || (syntax_bits & RE_NO_BK_PARENS
825 ? lexleft > 0 && *lexptr == ')'
826 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
827 || (syntax_bits & RE_NO_BK_VBAR
828 ? lexleft > 0 && *lexptr == '|'
829 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
830 || ((syntax_bits & RE_NEWLINE_ALT)
831 && lexleft > 0 && *lexptr == '\n'))
832 return lasttok = ENDLINE;
833 goto normal_char;
835 case '1':
836 case '2':
837 case '3':
838 case '4':
839 case '5':
840 case '6':
841 case '7':
842 case '8':
843 case '9':
844 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
846 laststart = 0;
847 return lasttok = BACKREF;
849 goto normal_char;
851 case '`':
852 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
853 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
854 goto normal_char;
856 case '\'':
857 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
858 return lasttok = ENDLINE; /* FIXME: should be end of string */
859 goto normal_char;
861 case '<':
862 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
863 return lasttok = BEGWORD;
864 goto normal_char;
866 case '>':
867 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
868 return lasttok = ENDWORD;
869 goto normal_char;
871 case 'b':
872 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
873 return lasttok = LIMWORD;
874 goto normal_char;
876 case 'B':
877 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
878 return lasttok = NOTLIMWORD;
879 goto normal_char;
881 case '?':
882 if (syntax_bits & RE_LIMITED_OPS)
883 goto normal_char;
884 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
885 goto normal_char;
886 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
887 goto normal_char;
888 return lasttok = QMARK;
890 case '*':
891 if (backslash)
892 goto normal_char;
893 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
894 goto normal_char;
895 return lasttok = STAR;
897 case '+':
898 if (syntax_bits & RE_LIMITED_OPS)
899 goto normal_char;
900 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
901 goto normal_char;
902 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
903 goto normal_char;
904 return lasttok = PLUS;
906 case '{':
907 if (!(syntax_bits & RE_INTERVALS))
908 goto normal_char;
909 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
910 goto normal_char;
911 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
912 goto normal_char;
914 if (syntax_bits & RE_NO_BK_BRACES)
916 /* Scan ahead for a valid interval; if it's not valid,
917 treat it as a literal '{'. */
918 int lo = -1, hi = -1;
919 char const *p = lexptr;
920 char const *lim = p + lexleft;
921 for (; p != lim && ISASCIIDIGIT (*p); p++)
922 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
923 if (p != lim && *p == ',')
924 while (++p != lim && ISASCIIDIGIT (*p))
925 hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
926 else
927 hi = lo;
928 if (p == lim || *p != '}'
929 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
930 goto normal_char;
933 minrep = 0;
934 /* Cases:
935 {M} - exact count
936 {M,} - minimum count, maximum is infinity
937 {M,N} - M through N */
938 FETCH(c, _("unfinished repeat count"));
939 if (ISASCIIDIGIT (c))
941 minrep = c - '0';
942 for (;;)
944 FETCH(c, _("unfinished repeat count"));
945 if (! ISASCIIDIGIT (c))
946 break;
947 minrep = 10 * minrep + c - '0';
950 else
951 dfaerror(_("malformed repeat count"));
952 if (c == ',')
954 FETCH (c, _("unfinished repeat count"));
955 if (! ISASCIIDIGIT (c))
956 maxrep = -1;
957 else
959 maxrep = c - '0';
960 for (;;)
962 FETCH (c, _("unfinished repeat count"));
963 if (! ISASCIIDIGIT (c))
964 break;
965 maxrep = 10 * maxrep + c - '0';
967 if (0 <= maxrep && maxrep < minrep)
968 dfaerror (_("malformed repeat count"));
971 else
972 maxrep = minrep;
973 if (!(syntax_bits & RE_NO_BK_BRACES))
975 if (c != '\\')
976 dfaerror(_("malformed repeat count"));
977 FETCH(c, _("unfinished repeat count"));
979 if (c != '}')
980 dfaerror(_("malformed repeat count"));
981 laststart = 0;
982 #ifdef GAWK
983 dfa->broken = (minrep == maxrep && minrep == 0);
984 #endif
985 return lasttok = REPMN;
987 case '|':
988 if (syntax_bits & RE_LIMITED_OPS)
989 goto normal_char;
990 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
991 goto normal_char;
992 laststart = 1;
993 return lasttok = OR;
995 case '\n':
996 if (syntax_bits & RE_LIMITED_OPS
997 || backslash
998 || !(syntax_bits & RE_NEWLINE_ALT))
999 goto normal_char;
1000 laststart = 1;
1001 return lasttok = OR;
1003 case '(':
1004 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1005 goto normal_char;
1006 ++parens;
1007 laststart = 1;
1008 return lasttok = LPAREN;
1010 case ')':
1011 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1012 goto normal_char;
1013 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1014 goto normal_char;
1015 --parens;
1016 laststart = 0;
1017 return lasttok = RPAREN;
1019 case '.':
1020 if (backslash)
1021 goto normal_char;
1022 #ifdef MBS_SUPPORT
1023 if (MB_CUR_MAX > 1)
1025 /* In multibyte environment period must match with a single
1026 character not a byte. So we use ANYCHAR. */
1027 laststart = 0;
1028 return lasttok = ANYCHAR;
1030 #endif /* MBS_SUPPORT */
1031 zeroset(ccl);
1032 notset(ccl);
1033 if (!(syntax_bits & RE_DOT_NEWLINE))
1034 clrbit(eolbyte, ccl);
1035 if (syntax_bits & RE_DOT_NOT_NULL)
1036 clrbit('\0', ccl);
1037 laststart = 0;
1038 return lasttok = CSET + charclass_index(ccl);
1040 #ifndef GAWK
1041 case 's':
1042 case 'S':
1043 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1044 goto normal_char;
1045 zeroset(ccl);
1046 for (c2 = 0; c2 < NOTCHAR; ++c2)
1047 if (ISSPACE(c2))
1048 setbit(c2, ccl);
1049 if (c == 'S')
1050 notset(ccl);
1051 laststart = 0;
1052 return lasttok = CSET + charclass_index(ccl);
1053 #endif
1055 case 'w':
1056 case 'W':
1057 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1058 goto normal_char;
1059 zeroset(ccl);
1060 for (c2 = 0; c2 < NOTCHAR; ++c2)
1061 if (IS_WORD_CONSTITUENT(c2))
1062 setbit(c2, ccl);
1063 if (c == 'W')
1064 notset(ccl);
1065 laststart = 0;
1066 return lasttok = CSET + charclass_index(ccl);
1068 case '[':
1069 if (backslash)
1070 goto normal_char;
1071 laststart = 0;
1072 return lasttok = parse_bracket_exp();
1074 default:
1075 normal_char:
1076 laststart = 0;
1077 #ifdef MBS_SUPPORT
1078 /* For multibyte character sets, folding is done in atom. Always
1079 return WCHAR. */
1080 if (MB_CUR_MAX > 1)
1081 return lasttok = WCHAR;
1082 #endif
1084 if (case_fold && ISALPHA(c))
1086 zeroset(ccl);
1087 setbit_case_fold (c, ccl);
1088 return lasttok = CSET + charclass_index(ccl);
1091 return lasttok = c;
1095 /* The above loop should consume at most a backslash
1096 and some other character. */
1097 abort();
1098 return END; /* keeps pedantic compilers happy. */
1101 /* Recursive descent parser for regular expressions. */
1103 static token tok; /* Lookahead token. */
1104 static int depth; /* Current depth of a hypothetical stack
1105 holding deferred productions. This is
1106 used to determine the depth that will be
1107 required of the real stack later on in
1108 dfaanalyze(). */
1110 static void
1111 addtok_mb (token t, int mbprop)
1113 #ifdef MBS_SUPPORT
1114 if (MB_CUR_MAX > 1)
1116 REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1117 dfa->tindex);
1118 dfa->multibyte_prop[dfa->tindex] = mbprop;
1120 #else
1121 (void) mbprop;
1122 #endif
1124 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1125 dfa->tokens[dfa->tindex++] = t;
1127 switch (t)
1129 case QMARK:
1130 case STAR:
1131 case PLUS:
1132 break;
1134 case CAT:
1135 case OR:
1136 case ORTOP:
1137 --depth;
1138 break;
1140 default:
1141 ++dfa->nleaves;
1142 case EMPTY:
1143 ++depth;
1144 break;
1146 if (depth > dfa->depth)
1147 dfa->depth = depth;
1150 /* Add the given token to the parse tree, maintaining the depth count and
1151 updating the maximum depth if necessary. */
1152 static void
1153 addtok (token t)
1155 #ifdef MBS_SUPPORT
1156 if (MB_CUR_MAX > 1 && t == MBCSET)
1157 addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
1158 else
1159 #endif
1160 addtok_mb (t, 3);
1163 #ifdef MBS_SUPPORT
1164 /* We treat a multibyte character as a single atom, so that DFA
1165 can treat a multibyte character as a single expression.
1167 e.g. We construct following tree from "<mb1><mb2>".
1168 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1169 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1170 static void
1171 addtok_wc (wint_t wc)
1173 unsigned char buf[MB_LEN_MAX];
1174 mbstate_t s;
1175 int i;
1176 memset (&s, 0, sizeof(s));
1177 cur_mb_len = wcrtomb ((char *) buf, wc, &s);
1178 addtok_mb(buf[0], cur_mb_len == 1 ? 3 : 1);
1179 for (i = 1; i < cur_mb_len; i++)
1181 addtok_mb(buf[i], i == cur_mb_len - 1 ? 2 : 0);
1182 addtok(CAT);
1185 #endif
1187 /* The grammar understood by the parser is as follows.
1189 regexp:
1190 regexp OR branch
1191 branch
1193 branch:
1194 branch closure
1195 closure
1197 closure:
1198 closure QMARK
1199 closure STAR
1200 closure PLUS
1201 closure REPMN
1202 atom
1204 atom:
1205 <normal character>
1206 <multibyte character>
1207 ANYCHAR
1208 MBCSET
1209 CSET
1210 BACKREF
1211 BEGLINE
1212 ENDLINE
1213 BEGWORD
1214 ENDWORD
1215 LIMWORD
1216 NOTLIMWORD
1217 LPAREN regexp RPAREN
1218 <empty>
1220 The parser builds a parse tree in postfix form in an array of tokens. */
1222 static void
1223 atom (void)
1225 #ifdef MBS_SUPPORT
1226 if (tok == WCHAR)
1228 addtok_wc (case_fold ? towlower(wctok) : wctok);
1229 #ifndef GREP
1230 if (case_fold && iswalpha(wctok))
1232 addtok_wc (towupper(wctok));
1233 addtok (OR);
1235 #endif
1237 tok = lex();
1238 return;
1240 #endif /* MBS_SUPPORT */
1242 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1243 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1244 #ifdef MBS_SUPPORT
1245 || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1246 #endif /* MBS_SUPPORT */
1247 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1249 addtok(tok);
1250 tok = lex();
1252 else if (tok == LPAREN)
1254 tok = lex();
1255 regexp(0);
1256 if (tok != RPAREN)
1257 dfaerror(_("unbalanced ("));
1258 tok = lex();
1260 else
1261 addtok(EMPTY);
1264 /* Return the number of tokens in the given subexpression. */
1265 static int
1266 nsubtoks (int tindex)
1268 int ntoks1;
1270 switch (dfa->tokens[tindex - 1])
1272 default:
1273 return 1;
1274 case QMARK:
1275 case STAR:
1276 case PLUS:
1277 return 1 + nsubtoks(tindex - 1);
1278 case CAT:
1279 case OR:
1280 case ORTOP:
1281 ntoks1 = nsubtoks(tindex - 1);
1282 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1286 /* Copy the given subexpression to the top of the tree. */
1287 static void
1288 copytoks (int tindex, int ntokens)
1290 int i;
1292 for (i = 0; i < ntokens; ++i)
1294 addtok(dfa->tokens[tindex + i]);
1295 #ifdef MBS_SUPPORT
1296 /* Update index into multibyte csets. */
1297 if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
1298 dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
1299 #endif
1303 static void
1304 closure (void)
1306 int tindex, ntokens, i;
1308 atom();
1309 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1310 if (tok == REPMN)
1312 ntokens = nsubtoks(dfa->tindex);
1313 tindex = dfa->tindex - ntokens;
1314 if (maxrep < 0)
1315 addtok(PLUS);
1316 if (minrep == 0)
1317 addtok(QMARK);
1318 for (i = 1; i < minrep; ++i)
1320 copytoks(tindex, ntokens);
1321 addtok(CAT);
1323 for (; i < maxrep; ++i)
1325 copytoks(tindex, ntokens);
1326 addtok(QMARK);
1327 addtok(CAT);
1329 tok = lex();
1331 else
1333 addtok(tok);
1334 tok = lex();
1338 static void
1339 branch (void)
1341 closure();
1342 while (tok != RPAREN && tok != OR && tok >= 0)
1344 closure();
1345 addtok(CAT);
1349 static void
1350 regexp (int toplevel)
1352 branch();
1353 while (tok == OR)
1355 tok = lex();
1356 branch();
1357 if (toplevel)
1358 addtok(ORTOP);
1359 else
1360 addtok(OR);
1364 /* Main entry point for the parser. S is a string to be parsed, len is the
1365 length of the string, so s can include NUL characters. D is a pointer to
1366 the struct dfa to parse into. */
1367 void
1368 dfaparse (char const *s, size_t len, struct dfa *d)
1370 dfa = d;
1371 lexptr = s;
1372 lexleft = len;
1373 lasttok = END;
1374 laststart = 1;
1375 parens = 0;
1376 #ifdef LC_COLLATE
1377 hard_LC_COLLATE = hard_locale (LC_COLLATE);
1378 #endif
1379 #ifdef MBS_SUPPORT
1380 if (MB_CUR_MAX > 1)
1382 cur_mb_len = 0;
1383 memset(&mbs, 0, sizeof(mbstate_t));
1385 #endif /* MBS_SUPPORT */
1387 if (! syntax_bits_set)
1388 dfaerror(_("no syntax specified"));
1390 tok = lex();
1391 depth = d->depth;
1393 regexp(1);
1395 if (tok != END)
1396 dfaerror(_("unbalanced )"));
1398 addtok(END - d->nregexps);
1399 addtok(CAT);
1401 if (d->nregexps)
1402 addtok(ORTOP);
1404 ++d->nregexps;
1407 /* Some primitives for operating on sets of positions. */
1409 /* Copy one set to another; the destination must be large enough. */
1410 static void
1411 copy (position_set const *src, position_set *dst)
1413 int i;
1415 for (i = 0; i < src->nelem; ++i)
1416 dst->elems[i] = src->elems[i];
1417 dst->nelem = src->nelem;
1420 /* Insert a position in a set. Position sets are maintained in sorted
1421 order according to index. If position already exists in the set with
1422 the same index then their constraints are logically or'd together.
1423 S->elems must point to an array large enough to hold the resulting set. */
1424 static void
1425 insert (position p, position_set *s)
1427 int count = s->nelem;
1428 int lo = 0, hi = count;
1429 while (lo < hi)
1431 int mid = ((unsigned) lo + (unsigned) hi) >> 1;
1432 if (s->elems[mid].index < p.index)
1433 lo = mid + 1;
1434 else
1435 hi = mid;
1438 if (lo < count && p.index == s->elems[lo].index)
1439 s->elems[lo].constraint |= p.constraint;
1440 else
1442 int i;
1443 for (i = count; i > lo; i--)
1444 s->elems[i] = s->elems[i - 1];
1445 s->elems[lo] = p;
1446 ++s->nelem;
1450 /* Merge two sets of positions into a third. The result is exactly as if
1451 the positions of both sets were inserted into an initially empty set. */
1452 static void
1453 merge (position_set const *s1, position_set const *s2, position_set *m)
1455 int i = 0, j = 0;
1457 m->nelem = 0;
1458 while (i < s1->nelem && j < s2->nelem)
1459 if (s1->elems[i].index > s2->elems[j].index)
1460 m->elems[m->nelem++] = s1->elems[i++];
1461 else if (s1->elems[i].index < s2->elems[j].index)
1462 m->elems[m->nelem++] = s2->elems[j++];
1463 else
1465 m->elems[m->nelem] = s1->elems[i++];
1466 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1468 while (i < s1->nelem)
1469 m->elems[m->nelem++] = s1->elems[i++];
1470 while (j < s2->nelem)
1471 m->elems[m->nelem++] = s2->elems[j++];
1474 /* Delete a position from a set. */
1475 static void
1476 delete (position p, position_set *s)
1478 int i;
1480 for (i = 0; i < s->nelem; ++i)
1481 if (p.index == s->elems[i].index)
1482 break;
1483 if (i < s->nelem)
1484 for (--s->nelem; i < s->nelem; ++i)
1485 s->elems[i] = s->elems[i + 1];
1488 /* Find the index of the state corresponding to the given position set with
1489 the given preceding context, or create a new state if there is no such
1490 state. Newline and letter tell whether we got here on a newline or
1491 letter, respectively. */
1492 static int
1493 state_index (struct dfa *d, position_set const *s, int newline, int letter)
1495 int hash = 0;
1496 int constraint;
1497 int i, j;
1499 newline = newline ? 1 : 0;
1500 letter = letter ? 1 : 0;
1502 for (i = 0; i < s->nelem; ++i)
1503 hash ^= s->elems[i].index + s->elems[i].constraint;
1505 /* Try to find a state that exactly matches the proposed one. */
1506 for (i = 0; i < d->sindex; ++i)
1508 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1509 || newline != d->states[i].newline || letter != d->states[i].letter)
1510 continue;
1511 for (j = 0; j < s->nelem; ++j)
1512 if (s->elems[j].constraint
1513 != d->states[i].elems.elems[j].constraint
1514 || s->elems[j].index != d->states[i].elems.elems[j].index)
1515 break;
1516 if (j == s->nelem)
1517 return i;
1520 /* We'll have to create a new state. */
1521 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1522 d->states[i].hash = hash;
1523 MALLOC(d->states[i].elems.elems, position, s->nelem);
1524 copy(s, &d->states[i].elems);
1525 d->states[i].newline = newline;
1526 d->states[i].letter = letter;
1527 d->states[i].backref = 0;
1528 d->states[i].constraint = 0;
1529 d->states[i].first_end = 0;
1530 #ifdef MBS_SUPPORT
1531 d->states[i].mbps.nelem = 0;
1532 d->states[i].mbps.elems = NULL;
1533 #endif
1534 for (j = 0; j < s->nelem; ++j)
1535 if (d->tokens[s->elems[j].index] < 0)
1537 constraint = s->elems[j].constraint;
1538 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1539 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1540 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1541 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1542 d->states[i].constraint |= constraint;
1543 if (! d->states[i].first_end)
1544 d->states[i].first_end = d->tokens[s->elems[j].index];
1546 else if (d->tokens[s->elems[j].index] == BACKREF)
1548 d->states[i].constraint = NO_CONSTRAINT;
1549 d->states[i].backref = 1;
1552 ++d->sindex;
1554 return i;
1557 /* Find the epsilon closure of a set of positions. If any position of the set
1558 contains a symbol that matches the empty string in some context, replace
1559 that position with the elements of its follow labeled with an appropriate
1560 constraint. Repeat exhaustively until no funny positions are left.
1561 S->elems must be large enough to hold the result. */
1562 static void
1563 epsclosure (position_set *s, struct dfa const *d)
1565 int i, j;
1566 char *visited; /* array of booleans, enough to use char, not int */
1567 position p, old;
1569 CALLOC(visited, char, d->tindex);
1571 for (i = 0; i < s->nelem; ++i)
1572 if (d->tokens[s->elems[i].index] >= NOTCHAR
1573 && d->tokens[s->elems[i].index] != BACKREF
1574 #ifdef MBS_SUPPORT
1575 && d->tokens[s->elems[i].index] != ANYCHAR
1576 && d->tokens[s->elems[i].index] != MBCSET
1577 #endif
1578 && d->tokens[s->elems[i].index] < CSET)
1580 old = s->elems[i];
1581 p.constraint = old.constraint;
1582 delete(s->elems[i], s);
1583 if (visited[old.index])
1585 --i;
1586 continue;
1588 visited[old.index] = 1;
1589 switch (d->tokens[old.index])
1591 case BEGLINE:
1592 p.constraint &= BEGLINE_CONSTRAINT;
1593 break;
1594 case ENDLINE:
1595 p.constraint &= ENDLINE_CONSTRAINT;
1596 break;
1597 case BEGWORD:
1598 p.constraint &= BEGWORD_CONSTRAINT;
1599 break;
1600 case ENDWORD:
1601 p.constraint &= ENDWORD_CONSTRAINT;
1602 break;
1603 case LIMWORD:
1604 p.constraint &= LIMWORD_CONSTRAINT;
1605 break;
1606 case NOTLIMWORD:
1607 p.constraint &= NOTLIMWORD_CONSTRAINT;
1608 break;
1609 default:
1610 break;
1612 for (j = 0; j < d->follows[old.index].nelem; ++j)
1614 p.index = d->follows[old.index].elems[j].index;
1615 insert(p, s);
1617 /* Force rescan to start at the beginning. */
1618 i = -1;
1621 free(visited);
1624 /* Perform bottom-up analysis on the parse tree, computing various functions.
1625 Note that at this point, we're pretending constructs like \< are real
1626 characters rather than constraints on what can follow them.
1628 Nullable: A node is nullable if it is at the root of a regexp that can
1629 match the empty string.
1630 * EMPTY leaves are nullable.
1631 * No other leaf is nullable.
1632 * A QMARK or STAR node is nullable.
1633 * A PLUS node is nullable if its argument is nullable.
1634 * A CAT node is nullable if both its arguments are nullable.
1635 * An OR node is nullable if either argument is nullable.
1637 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1638 that could correspond to the first character of a string matching the
1639 regexp rooted at the given node.
1640 * EMPTY leaves have empty firstpos.
1641 * The firstpos of a nonempty leaf is that leaf itself.
1642 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1643 argument.
1644 * The firstpos of a CAT node is the firstpos of the left argument, union
1645 the firstpos of the right if the left argument is nullable.
1646 * The firstpos of an OR node is the union of firstpos of each argument.
1648 Lastpos: The lastpos of a node is the set of positions that could
1649 correspond to the last character of a string matching the regexp at
1650 the given node.
1651 * EMPTY leaves have empty lastpos.
1652 * The lastpos of a nonempty leaf is that leaf itself.
1653 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1654 argument.
1655 * The lastpos of a CAT node is the lastpos of its right argument, union
1656 the lastpos of the left if the right argument is nullable.
1657 * The lastpos of an OR node is the union of the lastpos of each argument.
1659 Follow: The follow of a position is the set of positions that could
1660 correspond to the character following a character matching the node in
1661 a string matching the regexp. At this point we consider special symbols
1662 that match the empty string in some context to be just normal characters.
1663 Later, if we find that a special symbol is in a follow set, we will
1664 replace it with the elements of its follow, labeled with an appropriate
1665 constraint.
1666 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1667 the follow of every node in the lastpos.
1668 * Every node in the firstpos of the second argument of a CAT node is in
1669 the follow of every node in the lastpos of the first argument.
1671 Because of the postfix representation of the parse tree, the depth-first
1672 analysis is conveniently done by a linear scan with the aid of a stack.
1673 Sets are stored as arrays of the elements, obeying a stack-like allocation
1674 scheme; the number of elements in each set deeper in the stack can be
1675 used to determine the address of a particular set's array. */
1676 void
1677 dfaanalyze (struct dfa *d, int searchflag)
1679 int *nullable; /* Nullable stack. */
1680 int *nfirstpos; /* Element count stack for firstpos sets. */
1681 position *firstpos; /* Array where firstpos elements are stored. */
1682 int *nlastpos; /* Element count stack for lastpos sets. */
1683 position *lastpos; /* Array where lastpos elements are stored. */
1684 int *nalloc; /* Sizes of arrays allocated to follow sets. */
1685 position_set tmp; /* Temporary set for merging sets. */
1686 position_set merged; /* Result of merging sets. */
1687 int wants_newline; /* True if some position wants newline info. */
1688 int *o_nullable;
1689 int *o_nfirst, *o_nlast;
1690 position *o_firstpos, *o_lastpos;
1691 int i, j;
1692 position *pos;
1694 #ifdef DEBUG
1695 fprintf(stderr, "dfaanalyze:\n");
1696 for (i = 0; i < d->tindex; ++i)
1698 fprintf(stderr, " %d:", i);
1699 prtok(d->tokens[i]);
1701 putc('\n', stderr);
1702 #endif
1704 d->searchflag = searchflag;
1706 MALLOC(nullable, int, d->depth);
1707 o_nullable = nullable;
1708 MALLOC(nfirstpos, int, d->depth);
1709 o_nfirst = nfirstpos;
1710 MALLOC(firstpos, position, d->nleaves);
1711 o_firstpos = firstpos, firstpos += d->nleaves;
1712 MALLOC(nlastpos, int, d->depth);
1713 o_nlast = nlastpos;
1714 MALLOC(lastpos, position, d->nleaves);
1715 o_lastpos = lastpos, lastpos += d->nleaves;
1716 CALLOC(nalloc, int, d->tindex);
1717 MALLOC(merged.elems, position, d->nleaves);
1719 CALLOC(d->follows, position_set, d->tindex);
1721 for (i = 0; i < d->tindex; ++i)
1722 #ifdef DEBUG
1723 { /* Nonsyntactic #ifdef goo... */
1724 #endif
1725 switch (d->tokens[i])
1727 case EMPTY:
1728 /* The empty set is nullable. */
1729 *nullable++ = 1;
1731 /* The firstpos and lastpos of the empty leaf are both empty. */
1732 *nfirstpos++ = *nlastpos++ = 0;
1733 break;
1735 case STAR:
1736 case PLUS:
1737 /* Every element in the firstpos of the argument is in the follow
1738 of every element in the lastpos. */
1739 tmp.nelem = nfirstpos[-1];
1740 tmp.elems = firstpos;
1741 pos = lastpos;
1742 for (j = 0; j < nlastpos[-1]; ++j)
1744 merge(&tmp, &d->follows[pos[j].index], &merged);
1745 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1746 nalloc[pos[j].index], merged.nelem - 1);
1747 copy(&merged, &d->follows[pos[j].index]);
1750 case QMARK:
1751 /* A QMARK or STAR node is automatically nullable. */
1752 if (d->tokens[i] != PLUS)
1753 nullable[-1] = 1;
1754 break;
1756 case CAT:
1757 /* Every element in the firstpos of the second argument is in the
1758 follow of every element in the lastpos of the first argument. */
1759 tmp.nelem = nfirstpos[-1];
1760 tmp.elems = firstpos;
1761 pos = lastpos + nlastpos[-1];
1762 for (j = 0; j < nlastpos[-2]; ++j)
1764 merge(&tmp, &d->follows[pos[j].index], &merged);
1765 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1766 nalloc[pos[j].index], merged.nelem - 1);
1767 copy(&merged, &d->follows[pos[j].index]);
1770 /* The firstpos of a CAT node is the firstpos of the first argument,
1771 union that of the second argument if the first is nullable. */
1772 if (nullable[-2])
1773 nfirstpos[-2] += nfirstpos[-1];
1774 else
1775 firstpos += nfirstpos[-1];
1776 --nfirstpos;
1778 /* The lastpos of a CAT node is the lastpos of the second argument,
1779 union that of the first argument if the second is nullable. */
1780 if (nullable[-1])
1781 nlastpos[-2] += nlastpos[-1];
1782 else
1784 pos = lastpos + nlastpos[-2];
1785 for (j = nlastpos[-1] - 1; j >= 0; --j)
1786 pos[j] = lastpos[j];
1787 lastpos += nlastpos[-2];
1788 nlastpos[-2] = nlastpos[-1];
1790 --nlastpos;
1792 /* A CAT node is nullable if both arguments are nullable. */
1793 nullable[-2] = nullable[-1] && nullable[-2];
1794 --nullable;
1795 break;
1797 case OR:
1798 case ORTOP:
1799 /* The firstpos is the union of the firstpos of each argument. */
1800 nfirstpos[-2] += nfirstpos[-1];
1801 --nfirstpos;
1803 /* The lastpos is the union of the lastpos of each argument. */
1804 nlastpos[-2] += nlastpos[-1];
1805 --nlastpos;
1807 /* An OR node is nullable if either argument is nullable. */
1808 nullable[-2] = nullable[-1] || nullable[-2];
1809 --nullable;
1810 break;
1812 default:
1813 /* Anything else is a nonempty position. (Note that special
1814 constructs like \< are treated as nonempty strings here;
1815 an "epsilon closure" effectively makes them nullable later.
1816 Backreferences have to get a real position so we can detect
1817 transitions on them later. But they are nullable. */
1818 *nullable++ = d->tokens[i] == BACKREF;
1820 /* This position is in its own firstpos and lastpos. */
1821 *nfirstpos++ = *nlastpos++ = 1;
1822 --firstpos, --lastpos;
1823 firstpos->index = lastpos->index = i;
1824 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1826 /* Allocate the follow set for this position. */
1827 nalloc[i] = 1;
1828 MALLOC(d->follows[i].elems, position, nalloc[i]);
1829 break;
1831 #ifdef DEBUG
1832 /* ... balance the above nonsyntactic #ifdef goo... */
1833 fprintf(stderr, "node %d:", i);
1834 prtok(d->tokens[i]);
1835 putc('\n', stderr);
1836 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1837 fprintf(stderr, " firstpos:");
1838 for (j = nfirstpos[-1] - 1; j >= 0; --j)
1840 fprintf(stderr, " %d:", firstpos[j].index);
1841 prtok(d->tokens[firstpos[j].index]);
1843 fprintf(stderr, "\n lastpos:");
1844 for (j = nlastpos[-1] - 1; j >= 0; --j)
1846 fprintf(stderr, " %d:", lastpos[j].index);
1847 prtok(d->tokens[lastpos[j].index]);
1849 putc('\n', stderr);
1851 #endif
1853 /* For each follow set that is the follow set of a real position, replace
1854 it with its epsilon closure. */
1855 for (i = 0; i < d->tindex; ++i)
1856 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1857 #ifdef MBS_SUPPORT
1858 || d->tokens[i] == ANYCHAR
1859 || d->tokens[i] == MBCSET
1860 #endif
1861 || d->tokens[i] >= CSET)
1863 #ifdef DEBUG
1864 fprintf(stderr, "follows(%d:", i);
1865 prtok(d->tokens[i]);
1866 fprintf(stderr, "):");
1867 for (j = d->follows[i].nelem - 1; j >= 0; --j)
1869 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1870 prtok(d->tokens[d->follows[i].elems[j].index]);
1872 putc('\n', stderr);
1873 #endif
1874 copy(&d->follows[i], &merged);
1875 epsclosure(&merged, d);
1876 if (d->follows[i].nelem < merged.nelem)
1877 REALLOC(d->follows[i].elems, position, merged.nelem);
1878 copy(&merged, &d->follows[i]);
1881 /* Get the epsilon closure of the firstpos of the regexp. The result will
1882 be the set of positions of state 0. */
1883 merged.nelem = 0;
1884 for (i = 0; i < nfirstpos[-1]; ++i)
1885 insert(firstpos[i], &merged);
1886 epsclosure(&merged, d);
1888 /* Check if any of the positions of state 0 will want newline context. */
1889 wants_newline = 0;
1890 for (i = 0; i < merged.nelem; ++i)
1891 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1892 wants_newline = 1;
1894 /* Build the initial state. */
1895 d->salloc = 1;
1896 d->sindex = 0;
1897 MALLOC(d->states, dfa_state, d->salloc);
1898 state_index(d, &merged, wants_newline, 0);
1900 free(o_nullable);
1901 free(o_nfirst);
1902 free(o_firstpos);
1903 free(o_nlast);
1904 free(o_lastpos);
1905 free(nalloc);
1906 free(merged.elems);
1909 /* Find, for each character, the transition out of state s of d, and store
1910 it in the appropriate slot of trans.
1912 We divide the positions of s into groups (positions can appear in more
1913 than one group). Each group is labeled with a set of characters that
1914 every position in the group matches (taking into account, if necessary,
1915 preceding context information of s). For each group, find the union
1916 of the its elements' follows. This set is the set of positions of the
1917 new state. For each character in the group's label, set the transition
1918 on this character to be to a state corresponding to the set's positions,
1919 and its associated backward context information, if necessary.
1921 If we are building a searching matcher, we include the positions of state
1922 0 in every state.
1924 The collection of groups is constructed by building an equivalence-class
1925 partition of the positions of s.
1927 For each position, find the set of characters C that it matches. Eliminate
1928 any characters from C that fail on grounds of backward context.
1930 Search through the groups, looking for a group whose label L has nonempty
1931 intersection with C. If L - C is nonempty, create a new group labeled
1932 L - C and having the same positions as the current group, and set L to
1933 the intersection of L and C. Insert the position in this group, set
1934 C = C - L, and resume scanning.
1936 If after comparing with every group there are characters remaining in C,
1937 create a new group labeled with the characters of C and insert this
1938 position in that group. */
1939 void
1940 dfastate (int s, struct dfa *d, int trans[])
1942 position_set grps[NOTCHAR]; /* As many as will ever be needed. */
1943 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
1944 int ngrps = 0; /* Number of groups actually used. */
1945 position pos; /* Current position being considered. */
1946 charclass matches; /* Set of matching characters. */
1947 int matchesf; /* True if matches is nonempty. */
1948 charclass intersect; /* Intersection with some label set. */
1949 int intersectf; /* True if intersect is nonempty. */
1950 charclass leftovers; /* Stuff in the label that didn't match. */
1951 int leftoversf; /* True if leftovers is nonempty. */
1952 static charclass letters; /* Set of characters considered letters. */
1953 static charclass newline; /* Set of characters that aren't newline. */
1954 position_set follows; /* Union of the follows of some group. */
1955 position_set tmp; /* Temporary space for merging sets. */
1956 int state; /* New state. */
1957 int wants_newline; /* New state wants to know newline context. */
1958 int state_newline; /* New state on a newline transition. */
1959 int wants_letter; /* New state wants to know letter context. */
1960 int state_letter; /* New state on a letter transition. */
1961 static int initialized; /* Flag for static initialization. */
1962 #ifdef MBS_SUPPORT
1963 int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
1964 #endif
1965 int i, j, k;
1967 /* Initialize the set of letters, if necessary. */
1968 if (! initialized)
1970 initialized = 1;
1971 for (i = 0; i < NOTCHAR; ++i)
1972 if (IS_WORD_CONSTITUENT(i))
1973 setbit(i, letters);
1974 setbit(eolbyte, newline);
1977 zeroset(matches);
1979 for (i = 0; i < d->states[s].elems.nelem; ++i)
1981 pos = d->states[s].elems.elems[i];
1982 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1983 setbit(d->tokens[pos.index], matches);
1984 else if (d->tokens[pos.index] >= CSET)
1985 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1986 #ifdef MBS_SUPPORT
1987 else if (d->tokens[pos.index] == ANYCHAR
1988 || d->tokens[pos.index] == MBCSET)
1989 /* MB_CUR_MAX > 1 */
1991 /* ANYCHAR and MBCSET must match with a single character, so we
1992 must put it to d->states[s].mbps, which contains the positions
1993 which can match with a single character not a byte. */
1994 if (d->states[s].mbps.nelem == 0)
1996 MALLOC(d->states[s].mbps.elems, position,
1997 d->states[s].elems.nelem);
1999 insert(pos, &(d->states[s].mbps));
2000 continue;
2002 #endif /* MBS_SUPPORT */
2003 else
2004 continue;
2006 /* Some characters may need to be eliminated from matches because
2007 they fail in the current context. */
2008 if (pos.constraint != 0xFF)
2010 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2011 d->states[s].newline, 1))
2012 clrbit(eolbyte, matches);
2013 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2014 d->states[s].newline, 0))
2015 for (j = 0; j < CHARCLASS_INTS; ++j)
2016 matches[j] &= newline[j];
2017 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2018 d->states[s].letter, 1))
2019 for (j = 0; j < CHARCLASS_INTS; ++j)
2020 matches[j] &= ~letters[j];
2021 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2022 d->states[s].letter, 0))
2023 for (j = 0; j < CHARCLASS_INTS; ++j)
2024 matches[j] &= letters[j];
2026 /* If there are no characters left, there's no point in going on. */
2027 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2028 continue;
2029 if (j == CHARCLASS_INTS)
2030 continue;
2033 for (j = 0; j < ngrps; ++j)
2035 /* If matches contains a single character only, and the current
2036 group's label doesn't contain that character, go on to the
2037 next group. */
2038 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2039 && !tstbit(d->tokens[pos.index], labels[j]))
2040 continue;
2042 /* Check if this group's label has a nonempty intersection with
2043 matches. */
2044 intersectf = 0;
2045 for (k = 0; k < CHARCLASS_INTS; ++k)
2046 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2047 if (! intersectf)
2048 continue;
2050 /* It does; now find the set differences both ways. */
2051 leftoversf = matchesf = 0;
2052 for (k = 0; k < CHARCLASS_INTS; ++k)
2054 /* Even an optimizing compiler can't know this for sure. */
2055 int match = matches[k], label = labels[j][k];
2057 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2058 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2061 /* If there were leftovers, create a new group labeled with them. */
2062 if (leftoversf)
2064 copyset(leftovers, labels[ngrps]);
2065 copyset(intersect, labels[j]);
2066 MALLOC(grps[ngrps].elems, position, d->nleaves);
2067 copy(&grps[j], &grps[ngrps]);
2068 ++ngrps;
2071 /* Put the position in the current group. Note that there is no
2072 reason to call insert() here. */
2073 grps[j].elems[grps[j].nelem++] = pos;
2075 /* If every character matching the current position has been
2076 accounted for, we're done. */
2077 if (! matchesf)
2078 break;
2081 /* If we've passed the last group, and there are still characters
2082 unaccounted for, then we'll have to create a new group. */
2083 if (j == ngrps)
2085 copyset(matches, labels[ngrps]);
2086 zeroset(matches);
2087 MALLOC(grps[ngrps].elems, position, d->nleaves);
2088 grps[ngrps].nelem = 1;
2089 grps[ngrps].elems[0] = pos;
2090 ++ngrps;
2094 MALLOC(follows.elems, position, d->nleaves);
2095 MALLOC(tmp.elems, position, d->nleaves);
2097 /* If we are a searching matcher, the default transition is to a state
2098 containing the positions of state 0, otherwise the default transition
2099 is to fail miserably. */
2100 if (d->searchflag)
2102 wants_newline = 0;
2103 wants_letter = 0;
2104 for (i = 0; i < d->states[0].elems.nelem; ++i)
2106 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2107 wants_newline = 1;
2108 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2109 wants_letter = 1;
2111 copy(&d->states[0].elems, &follows);
2112 state = state_index(d, &follows, 0, 0);
2113 if (wants_newline)
2114 state_newline = state_index(d, &follows, 1, 0);
2115 else
2116 state_newline = state;
2117 if (wants_letter)
2118 state_letter = state_index(d, &follows, 0, 1);
2119 else
2120 state_letter = state;
2121 for (i = 0; i < NOTCHAR; ++i)
2122 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2123 trans[eolbyte] = state_newline;
2125 else
2126 for (i = 0; i < NOTCHAR; ++i)
2127 trans[i] = -1;
2129 for (i = 0; i < ngrps; ++i)
2131 follows.nelem = 0;
2133 /* Find the union of the follows of the positions of the group.
2134 This is a hideously inefficient loop. Fix it someday. */
2135 for (j = 0; j < grps[i].nelem; ++j)
2136 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2137 insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2139 #ifdef MBS_SUPPORT
2140 if (d->mb_cur_max > 1)
2142 /* If a token in follows.elems is not 1st byte of a multibyte
2143 character, or the states of follows must accept the bytes
2144 which are not 1st byte of the multibyte character.
2145 Then, if a state of follows encounter a byte, it must not be
2146 a 1st byte of a multibyte character nor single byte character.
2147 We cansel to add state[0].follows to next state, because
2148 state[0] must accept 1st-byte
2150 For example, we assume <sb a> is a certain single byte
2151 character, <mb A> is a certain multibyte character, and the
2152 codepoint of <sb a> equals the 2nd byte of the codepoint of
2153 <mb A>.
2154 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2155 by accepting accepts 1st byte of <mb A>, and state[i+1]
2156 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2157 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2158 <mb A>, so we can not add state[0]. */
2160 next_isnt_1st_byte = 0;
2161 for (j = 0; j < follows.nelem; ++j)
2163 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2165 next_isnt_1st_byte = 1;
2166 break;
2170 #endif
2172 /* If we are building a searching matcher, throw in the positions
2173 of state 0 as well. */
2174 #ifdef MBS_SUPPORT
2175 if (d->searchflag && (d->mb_cur_max == 1 || !next_isnt_1st_byte))
2176 #else
2177 if (d->searchflag)
2178 #endif
2179 for (j = 0; j < d->states[0].elems.nelem; ++j)
2180 insert(d->states[0].elems.elems[j], &follows);
2182 /* Find out if the new state will want any context information. */
2183 wants_newline = 0;
2184 if (tstbit(eolbyte, labels[i]))
2185 for (j = 0; j < follows.nelem; ++j)
2186 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2187 wants_newline = 1;
2189 wants_letter = 0;
2190 for (j = 0; j < CHARCLASS_INTS; ++j)
2191 if (labels[i][j] & letters[j])
2192 break;
2193 if (j < CHARCLASS_INTS)
2194 for (j = 0; j < follows.nelem; ++j)
2195 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2196 wants_letter = 1;
2198 /* Find the state(s) corresponding to the union of the follows. */
2199 state = state_index(d, &follows, 0, 0);
2200 if (wants_newline)
2201 state_newline = state_index(d, &follows, 1, 0);
2202 else
2203 state_newline = state;
2204 if (wants_letter)
2205 state_letter = state_index(d, &follows, 0, 1);
2206 else
2207 state_letter = state;
2209 /* Set the transitions for each character in the current label. */
2210 for (j = 0; j < CHARCLASS_INTS; ++j)
2211 for (k = 0; k < INTBITS; ++k)
2212 if (labels[i][j] & 1 << k)
2214 int c = j * INTBITS + k;
2216 if (c == eolbyte)
2217 trans[c] = state_newline;
2218 else if (IS_WORD_CONSTITUENT(c))
2219 trans[c] = state_letter;
2220 else if (c < NOTCHAR)
2221 trans[c] = state;
2225 for (i = 0; i < ngrps; ++i)
2226 free(grps[i].elems);
2227 free(follows.elems);
2228 free(tmp.elems);
2231 /* Some routines for manipulating a compiled dfa's transition tables.
2232 Each state may or may not have a transition table; if it does, and it
2233 is a non-accepting state, then d->trans[state] points to its table.
2234 If it is an accepting state then d->fails[state] points to its table.
2235 If it has no table at all, then d->trans[state] is NULL.
2236 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2238 static void
2239 build_state (int s, struct dfa *d)
2241 int *trans; /* The new transition table. */
2242 int i;
2244 /* Set an upper limit on the number of transition tables that will ever
2245 exist at once. 1024 is arbitrary. The idea is that the frequently
2246 used transition tables will be quickly rebuilt, whereas the ones that
2247 were only needed once or twice will be cleared away. */
2248 if (d->trcount >= 1024)
2250 for (i = 0; i < d->tralloc; ++i)
2252 free(d->trans[i]);
2253 free(d->fails[i]);
2254 d->trans[i] = d->fails[i] = NULL;
2256 d->trcount = 0;
2259 ++d->trcount;
2261 /* Set up the success bits for this state. */
2262 d->success[s] = 0;
2263 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2264 s, *d))
2265 d->success[s] |= 4;
2266 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2267 s, *d))
2268 d->success[s] |= 2;
2269 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2270 s, *d))
2271 d->success[s] |= 1;
2273 MALLOC(trans, int, NOTCHAR);
2274 dfastate(s, d, trans);
2276 /* Now go through the new transition table, and make sure that the trans
2277 and fail arrays are allocated large enough to hold a pointer for the
2278 largest state mentioned in the table. */
2279 for (i = 0; i < NOTCHAR; ++i)
2280 if (trans[i] >= d->tralloc)
2282 int oldalloc = d->tralloc;
2284 while (trans[i] >= d->tralloc)
2285 d->tralloc *= 2;
2286 REALLOC(d->realtrans, int *, d->tralloc + 1);
2287 d->trans = d->realtrans + 1;
2288 REALLOC(d->fails, int *, d->tralloc);
2289 REALLOC(d->success, int, d->tralloc);
2290 REALLOC(d->newlines, int, d->tralloc);
2291 while (oldalloc < d->tralloc)
2293 d->trans[oldalloc] = NULL;
2294 d->fails[oldalloc++] = NULL;
2298 /* Keep the newline transition in a special place so we can use it as
2299 a sentinel. */
2300 d->newlines[s] = trans[eolbyte];
2301 trans[eolbyte] = -1;
2303 if (ACCEPTING(s, *d))
2304 d->fails[s] = trans;
2305 else
2306 d->trans[s] = trans;
2309 static void
2310 build_state_zero (struct dfa *d)
2312 d->tralloc = 1;
2313 d->trcount = 0;
2314 CALLOC(d->realtrans, int *, d->tralloc + 1);
2315 d->trans = d->realtrans + 1;
2316 CALLOC(d->fails, int *, d->tralloc);
2317 MALLOC(d->success, int, d->tralloc);
2318 MALLOC(d->newlines, int, d->tralloc);
2319 build_state(0, d);
2322 #ifdef MBS_SUPPORT
2323 /* Multibyte character handling sub-routines for dfaexec. */
2325 /* Initial state may encounter the byte which is not a single byte character
2326 nor 1st byte of a multibyte character. But it is incorrect for initial
2327 state to accept such a byte.
2328 For example, in sjis encoding the regular expression like "\\" accepts
2329 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2330 0x815c. Then Initial state must skip the bytes which are not a single byte
2331 character nor 1st byte of a multibyte character. */
2332 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2333 if (s == 0) \
2335 while (inputwcs[p - buf_begin] == 0 \
2336 && mblen_buf[p - buf_begin] > 0 \
2337 && (unsigned char const *) p < buf_end) \
2338 ++p; \
2339 if ((char *) p >= end) \
2341 free(mblen_buf); \
2342 free(inputwcs); \
2343 *end = saved_end; \
2344 return NULL; \
2348 static void
2349 realloc_trans_if_necessary(struct dfa *d, int new_state)
2351 /* Make sure that the trans and fail arrays are allocated large enough
2352 to hold a pointer for the new state. */
2353 if (new_state >= d->tralloc)
2355 int oldalloc = d->tralloc;
2357 while (new_state >= d->tralloc)
2358 d->tralloc *= 2;
2359 REALLOC(d->realtrans, int *, d->tralloc + 1);
2360 d->trans = d->realtrans + 1;
2361 REALLOC(d->fails, int *, d->tralloc);
2362 REALLOC(d->success, int, d->tralloc);
2363 REALLOC(d->newlines, int, d->tralloc);
2364 while (oldalloc < d->tralloc)
2366 d->trans[oldalloc] = NULL;
2367 d->fails[oldalloc++] = NULL;
2372 /* Return values of transit_state_singlebyte(), and
2373 transit_state_consume_1char. */
2374 typedef enum
2376 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2377 TRANSIT_STATE_DONE, /* State transition has finished. */
2378 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2379 } status_transit_state;
2381 /* Consume a single byte and transit state from 's' to '*next_state'.
2382 This function is almost same as the state transition routin in dfaexec().
2383 But state transition is done just once, otherwise matching succeed or
2384 reach the end of the buffer. */
2385 static status_transit_state
2386 transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2387 int *next_state)
2389 int *t;
2390 int works = s;
2392 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2394 while (rval == TRANSIT_STATE_IN_PROGRESS)
2396 if ((t = d->trans[works]) != NULL)
2398 works = t[*p];
2399 rval = TRANSIT_STATE_DONE;
2400 if (works < 0)
2401 works = 0;
2403 else if (works < 0)
2405 if (p == buf_end)
2407 /* At the moment, it must not happen. */
2408 abort ();
2410 works = 0;
2412 else if (d->fails[works])
2414 works = d->fails[works][*p];
2415 rval = TRANSIT_STATE_DONE;
2417 else
2419 build_state(works, d);
2422 *next_state = works;
2423 return rval;
2426 /* Check whether period can match or not in the current context. If it can,
2427 return the amount of the bytes with which period can match, otherwise
2428 return 0.
2429 `pos' is the position of the period. `idx' is the index from the
2430 buf_begin, and it is the current position in the buffer. */
2431 static int
2432 match_anychar (struct dfa *d, int s, position pos, int idx)
2434 int newline = 0;
2435 int letter = 0;
2436 wchar_t wc;
2437 int mbclen;
2439 wc = inputwcs[idx];
2440 mbclen = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
2442 /* Check context. */
2443 if (wc == (wchar_t)eolbyte)
2445 if (!(syntax_bits & RE_DOT_NEWLINE))
2446 return 0;
2447 newline = 1;
2449 else if (wc == (wchar_t)'\0')
2451 if (syntax_bits & RE_DOT_NOT_NULL)
2452 return 0;
2453 newline = 1;
2456 if (iswalnum(wc) || wc == L'_')
2457 letter = 1;
2459 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2460 newline, d->states[s].letter, letter))
2461 return 0;
2463 return mbclen;
2466 /* Check whether bracket expression can match or not in the current context.
2467 If it can, return the amount of the bytes with which expression can match,
2468 otherwise return 0.
2469 `pos' is the position of the bracket expression. `idx' is the index
2470 from the buf_begin, and it is the current position in the buffer. */
2471 static int
2472 match_mb_charset (struct dfa *d, int s, position pos, int idx)
2474 int i;
2475 int match; /* Flag which represent that matching succeed. */
2476 int match_len; /* Length of the character (or collating element)
2477 with which this operator match. */
2478 int op_len; /* Length of the operator. */
2479 char buffer[128];
2480 wchar_t wcbuf[6];
2482 /* Pointer to the structure to which we are currently refering. */
2483 struct mb_char_classes *work_mbc;
2485 int newline = 0;
2486 int letter = 0;
2487 wchar_t wc; /* Current refering character. */
2489 wc = inputwcs[idx];
2491 /* Check context. */
2492 if (wc == (wchar_t)eolbyte)
2494 if (!(syntax_bits & RE_DOT_NEWLINE))
2495 return 0;
2496 newline = 1;
2498 else if (wc == (wchar_t)'\0')
2500 if (syntax_bits & RE_DOT_NOT_NULL)
2501 return 0;
2502 newline = 1;
2504 if (iswalnum(wc) || wc == L'_')
2505 letter = 1;
2506 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2507 newline, d->states[s].letter, letter))
2508 return 0;
2510 /* Assign the current refering operator to work_mbc. */
2511 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2512 match = !work_mbc->invert;
2513 match_len = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
2515 /* Match in range 0-255? */
2516 if (wc < NOTCHAR && work_mbc->cset != -1
2517 && tstbit((unsigned char)wc, d->charclasses[work_mbc->cset]))
2518 goto charset_matched;
2520 /* match with a character class? */
2521 for (i = 0; i<work_mbc->nch_classes; i++)
2523 if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2524 goto charset_matched;
2527 strncpy(buffer, (char const *) buf_begin + idx, match_len);
2528 buffer[match_len] = '\0';
2530 /* match with an equivalent class? */
2531 for (i = 0; i<work_mbc->nequivs; i++)
2533 op_len = strlen(work_mbc->equivs[i]);
2534 strncpy(buffer, (char const *) buf_begin + idx, op_len);
2535 buffer[op_len] = '\0';
2536 if (strcoll(work_mbc->equivs[i], buffer) == 0)
2538 match_len = op_len;
2539 goto charset_matched;
2543 /* match with a collating element? */
2544 for (i = 0; i<work_mbc->ncoll_elems; i++)
2546 op_len = strlen(work_mbc->coll_elems[i]);
2547 strncpy(buffer, (char const *) buf_begin + idx, op_len);
2548 buffer[op_len] = '\0';
2550 if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2552 match_len = op_len;
2553 goto charset_matched;
2557 wcbuf[0] = wc;
2558 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2560 /* match with a range? */
2561 for (i = 0; i<work_mbc->nranges; i++)
2563 wcbuf[2] = work_mbc->range_sts[i];
2564 wcbuf[4] = work_mbc->range_ends[i];
2566 if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2567 wcscoll(wcbuf+4, wcbuf) >= 0)
2568 goto charset_matched;
2571 /* match with a character? */
2572 for (i = 0; i<work_mbc->nchars; i++)
2574 if (wc == work_mbc->chars[i])
2575 goto charset_matched;
2578 match = !match;
2580 charset_matched:
2581 return match ? match_len : 0;
2584 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2585 array which corresponds to `d->states[s].mbps.elem' and each element of
2586 the array contains the amount of the bytes with which the element can
2587 match.
2588 `idx' is the index from the buf_begin, and it is the current position
2589 in the buffer.
2590 Caller MUST free the array which this function return. */
2591 static int*
2592 check_matching_with_multibyte_ops (struct dfa *d, int s, int idx)
2594 int i;
2595 int* rarray;
2597 MALLOC(rarray, int, d->states[s].mbps.nelem);
2598 for (i = 0; i < d->states[s].mbps.nelem; ++i)
2600 position pos = d->states[s].mbps.elems[i];
2601 switch(d->tokens[pos.index])
2603 case ANYCHAR:
2604 rarray[i] = match_anychar(d, s, pos, idx);
2605 break;
2606 case MBCSET:
2607 rarray[i] = match_mb_charset(d, s, pos, idx);
2608 break;
2609 default:
2610 break; /* can not happen. */
2613 return rarray;
2616 /* Consume a single character and enumerate all of the positions which can
2617 be next position from the state `s'.
2618 `match_lens' is the input. It can be NULL, but it can also be the output
2619 of check_matching_with_multibyte_ops() for optimization.
2620 `mbclen' and `pps' are the output. `mbclen' is the length of the
2621 character consumed, and `pps' is the set this function enumerate. */
2622 static status_transit_state
2623 transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2624 int *match_lens, int *mbclen, position_set *pps)
2626 int i, j;
2627 int s1, s2;
2628 int* work_mbls;
2629 status_transit_state rs = TRANSIT_STATE_DONE;
2631 /* Calculate the length of the (single/multi byte) character
2632 to which p points. */
2633 *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2634 : mblen_buf[*pp - buf_begin];
2636 /* Calculate the state which can be reached from the state `s' by
2637 consuming `*mbclen' single bytes from the buffer. */
2638 s1 = s;
2639 for (i = 0; i < *mbclen; i++)
2641 s2 = s1;
2642 rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2644 /* Copy the positions contained by `s1' to the set `pps'. */
2645 copy(&(d->states[s1].elems), pps);
2647 /* Check (inputed)match_lens, and initialize if it is NULL. */
2648 if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2649 work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2650 else
2651 work_mbls = match_lens;
2653 /* Add all of the positions which can be reached from `s' by consuming
2654 a single character. */
2655 for (i = 0; i < d->states[s].mbps.nelem ; i++)
2657 if (work_mbls[i] == *mbclen)
2658 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2659 j++)
2660 insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2661 pps);
2664 if (match_lens == NULL && work_mbls != NULL)
2665 free(work_mbls);
2666 return rs;
2669 /* Transit state from s, then return new state and update the pointer of the
2670 buffer. This function is for some operator which can match with a multi-
2671 byte character or a collating element (which may be multi characters). */
2672 static int
2673 transit_state (struct dfa *d, int s, unsigned char const **pp)
2675 int s1;
2676 int mbclen; /* The length of current input multibyte character. */
2677 int maxlen = 0;
2678 int i, j;
2679 int *match_lens = NULL;
2680 int nelem = d->states[s].mbps.nelem; /* Just a alias. */
2681 position_set follows;
2682 unsigned char const *p1 = *pp;
2683 wchar_t wc;
2685 if (nelem > 0)
2686 /* This state has (a) multibyte operator(s).
2687 We check whether each of them can match or not. */
2689 /* Note: caller must free the return value of this function. */
2690 match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2692 for (i = 0; i < nelem; i++)
2693 /* Search the operator which match the longest string,
2694 in this state. */
2696 if (match_lens[i] > maxlen)
2697 maxlen = match_lens[i];
2701 if (nelem == 0 || maxlen == 0)
2702 /* This state has no multibyte operator which can match.
2703 We need to check only one single byte character. */
2705 status_transit_state rs;
2706 rs = transit_state_singlebyte(d, s, *pp, &s1);
2708 /* We must update the pointer if state transition succeeded. */
2709 if (rs == TRANSIT_STATE_DONE)
2710 ++*pp;
2712 free(match_lens);
2713 return s1;
2716 /* This state has some operators which can match a multibyte character. */
2717 follows.nelem = 0;
2718 MALLOC(follows.elems, position, d->nleaves);
2720 /* `maxlen' may be longer than the length of a character, because it may
2721 not be a character but a (multi character) collating element.
2722 We enumerate all of the positions which `s' can reach by consuming
2723 `maxlen' bytes. */
2724 transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2726 wc = inputwcs[*pp - mbclen - buf_begin];
2727 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2728 realloc_trans_if_necessary(d, s1);
2730 while (*pp - p1 < maxlen)
2732 follows.nelem = 0;
2733 transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2735 for (i = 0; i < nelem ; i++)
2737 if (match_lens[i] == *pp - p1)
2738 for (j = 0;
2739 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2740 insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2741 &follows);
2744 wc = inputwcs[*pp - mbclen - buf_begin];
2745 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2746 realloc_trans_if_necessary(d, s1);
2748 free(match_lens);
2749 free(follows.elems);
2750 return s1;
2753 #endif /* MBS_SUPPORT */
2755 /* Search through a buffer looking for a match to the given struct dfa.
2756 Find the first occurrence of a string matching the regexp in the
2757 buffer, and the shortest possible version thereof. Return a pointer to
2758 the first character after the match, or NULL if none is found. BEGIN
2759 points to the beginning of the buffer, and END points to the first byte
2760 after its end. Note however that we store a sentinel byte (usually
2761 newline) in *END, so the actual buffer must be one byte longer.
2762 When NEWLINE is nonzero, newlines may appear in the matching string.
2763 If COUNT is non-NULL, increment *COUNT once for each newline processed.
2764 Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
2765 encountered a back-reference (1) or not (0). The caller may use this
2766 to decide whether to fall back on a backtracking matcher. */
2767 char *
2768 dfaexec (struct dfa *d, char const *begin, char *end,
2769 int newline, int *count, int *backref)
2771 int s, s1, tmp; /* Current state. */
2772 unsigned char const *p; /* Current input character. */
2773 int **trans, *t; /* Copy of d->trans so it can be optimized
2774 into a register. */
2775 unsigned char eol = eolbyte; /* Likewise for eolbyte. */
2776 static int sbit[NOTCHAR]; /* Table for anding with d->success. */
2777 static int sbit_init;
2779 if (! sbit_init)
2781 int i;
2783 sbit_init = 1;
2784 for (i = 0; i < NOTCHAR; ++i)
2785 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2786 sbit[eol] = 4;
2789 if (! d->tralloc)
2790 build_state_zero(d);
2792 s = s1 = 0;
2793 p = (unsigned char const *) begin;
2794 trans = d->trans;
2795 unsigned char saved_end = *(unsigned char *) end;
2796 *end = eol;
2798 #ifdef MBS_SUPPORT
2799 if (d->mb_cur_max > 1)
2801 int remain_bytes, i;
2802 buf_begin = (unsigned char *) begin;
2803 buf_end = (unsigned char *) end;
2805 /* initialize mblen_buf, and inputwcs. */
2806 MALLOC(mblen_buf, unsigned char, end - begin + 2);
2807 MALLOC(inputwcs, wchar_t, end - begin + 2);
2808 memset(&mbs, 0, sizeof(mbstate_t));
2809 remain_bytes = 0;
2810 for (i = 0; i < end - begin + 1; i++)
2812 if (remain_bytes == 0)
2814 remain_bytes
2815 = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
2816 if (remain_bytes < 1
2817 || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
2819 remain_bytes = 0;
2820 inputwcs[i] = (wchar_t)begin[i];
2821 mblen_buf[i] = 0;
2823 else
2825 mblen_buf[i] = remain_bytes;
2826 remain_bytes--;
2829 else
2831 mblen_buf[i] = remain_bytes;
2832 inputwcs[i] = 0;
2833 remain_bytes--;
2836 mblen_buf[i] = 0;
2837 inputwcs[i] = 0; /* sentinel */
2839 #endif /* MBS_SUPPORT */
2841 for (;;)
2843 #ifdef MBS_SUPPORT
2844 if (d->mb_cur_max > 1)
2845 while ((t = trans[s]))
2847 if ((char *) p > end)
2848 break;
2849 s1 = s;
2850 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2852 if (d->states[s].mbps.nelem == 0)
2854 s = t[*p++];
2855 continue;
2858 /* Can match with a multibyte character (and multi character
2859 collating element). Transition table might be updated. */
2860 s = transit_state(d, s, &p);
2861 trans = d->trans;
2863 else
2864 #endif /* MBS_SUPPORT */
2865 while ((t = trans[s]) != 0) { /* hand-optimized loop */
2866 s1 = t[*p++];
2867 if ((t = trans[s1]) == 0) {
2868 tmp = s ; s = s1 ; s1 = tmp ; /* swap */
2869 break;
2871 s = t[*p++];
2874 if (s >= 0 && (char *) p <= end && d->fails[s])
2876 if (d->success[s] & sbit[*p])
2878 if (backref)
2879 *backref = (d->states[s].backref != 0);
2880 #ifdef MBS_SUPPORT
2881 if (d->mb_cur_max > 1)
2883 free(mblen_buf);
2884 free(inputwcs);
2886 #endif /* MBS_SUPPORT */
2887 *end = saved_end;
2888 return (char *) p;
2891 s1 = s;
2892 #ifdef MBS_SUPPORT
2893 if (d->mb_cur_max > 1)
2895 /* Can match with a multibyte character (and multicharacter
2896 collating element). Transition table might be updated. */
2897 s = transit_state(d, s, &p);
2898 trans = d->trans;
2900 else
2901 #endif /* MBS_SUPPORT */
2902 s = d->fails[s][*p++];
2903 continue;
2906 /* If the previous character was a newline, count it. */
2907 if (count && (char *) p <= end && p[-1] == eol)
2908 ++*count;
2910 /* Check if we've run off the end of the buffer. */
2911 if ((char *) p > end)
2913 #ifdef MBS_SUPPORT
2914 if (d->mb_cur_max > 1)
2916 free(mblen_buf);
2917 free(inputwcs);
2919 #endif /* MBS_SUPPORT */
2920 *end = saved_end;
2921 return NULL;
2924 if (s >= 0)
2926 build_state(s, d);
2927 trans = d->trans;
2928 continue;
2931 if (p[-1] == eol && newline)
2933 s = d->newlines[s1];
2934 continue;
2937 s = 0;
2941 #ifdef MBS_SUPPORT
2942 static void
2943 free_mbdata (struct dfa *d)
2945 unsigned int i;
2947 free(d->multibyte_prop);
2948 d->multibyte_prop = NULL;
2950 for (i = 0; i < d->nmbcsets; ++i)
2952 unsigned int j;
2953 struct mb_char_classes *p = &(d->mbcsets[i]);
2954 free(p->chars);
2955 free(p->ch_classes);
2956 free(p->range_sts);
2957 free(p->range_ends);
2959 for (j = 0; j < p->nequivs; ++j)
2960 free(p->equivs[j]);
2961 free(p->equivs);
2963 for (j = 0; j < p->ncoll_elems; ++j)
2964 free(p->coll_elems[j]);
2965 free(p->coll_elems);
2968 free(d->mbcsets);
2969 d->mbcsets = NULL;
2970 d->nmbcsets = 0;
2972 #endif
2974 /* Initialize the components of a dfa that the other routines don't
2975 initialize for themselves. */
2976 void
2977 dfainit (struct dfa *d)
2979 d->calloc = 1;
2980 MALLOC(d->charclasses, charclass, d->calloc);
2981 d->cindex = 0;
2983 d->talloc = 1;
2984 MALLOC(d->tokens, token, d->talloc);
2985 d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2987 #ifdef MBS_SUPPORT
2988 d->mb_cur_max = MB_CUR_MAX;
2989 if (d->mb_cur_max > 1)
2991 d->nmultibyte_prop = 1;
2992 MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2993 d->nmbcsets = 0;
2994 d->mbcsets_alloc = 1;
2995 MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2997 #endif
2999 d->searchflag = 0;
3000 d->tralloc = 0;
3002 d->musts = 0;
3003 d->realtrans = 0;
3004 d->fails = 0;
3005 d->newlines = 0;
3006 d->success = 0;
3007 #ifdef GAWK
3008 d->broken = 0;
3009 #endif
3012 #ifdef MBS_SUPPORT
3013 static void
3014 dfaoptimize (struct dfa *d)
3016 unsigned int i;
3017 if (!using_utf8())
3018 return;
3020 for (i = 0; i < d->tindex; ++i)
3022 switch(d->tokens[i])
3024 case ANYCHAR:
3025 case MBCSET:
3026 /* Requires multi-byte algorithm. */
3027 return;
3028 default:
3029 break;
3033 free_mbdata (d);
3034 d->mb_cur_max = 1;
3036 #endif
3038 /* Parse and analyze a single string of the given length. */
3039 void
3040 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
3042 dfainit(d);
3043 dfaparse(s, len, d);
3044 dfamust(d);
3045 #ifdef MBS_SUPPORT
3046 dfaoptimize(d);
3047 #endif
3048 dfaanalyze(d, searchflag);
3051 /* Free the storage held by the components of a dfa. */
3052 void
3053 dfafree (struct dfa *d)
3055 int i;
3056 struct dfamust *dm, *ndm;
3058 free(d->charclasses);
3059 free(d->tokens);
3061 #ifdef MBS_SUPPORT
3062 if (d->mb_cur_max > 1)
3063 free_mbdata(d);
3064 #endif /* MBS_SUPPORT */
3066 for (i = 0; i < d->sindex; ++i) {
3067 free(d->states[i].elems.elems);
3068 #ifdef MBS_SUPPORT
3069 free(d->states[i].mbps.elems);
3070 #endif /* MBS_SUPPORT */
3072 free(d->states);
3073 for (i = 0; i < d->tindex; ++i)
3074 free(d->follows[i].elems);
3075 free(d->follows);
3076 for (i = 0; i < d->tralloc; ++i)
3078 free(d->trans[i]);
3079 free(d->fails[i]);
3081 free(d->realtrans);
3082 free(d->fails);
3083 free(d->newlines);
3084 free(d->success);
3085 for (dm = d->musts; dm; dm = ndm)
3087 ndm = dm->next;
3088 free(dm->must);
3089 free(dm);
3093 /* Having found the postfix representation of the regular expression,
3094 try to find a long sequence of characters that must appear in any line
3095 containing the r.e.
3096 Finding a "longest" sequence is beyond the scope here;
3097 we take an easy way out and hope for the best.
3098 (Take "(ab|a)b"--please.)
3100 We do a bottom-up calculation of sequences of characters that must appear
3101 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3102 representation:
3103 sequences that must appear at the left of the match ("left")
3104 sequences that must appear at the right of the match ("right")
3105 lists of sequences that must appear somewhere in the match ("in")
3106 sequences that must constitute the match ("is")
3108 When we get to the root of the tree, we use one of the longest of its
3109 calculated "in" sequences as our answer. The sequence we find is returned in
3110 d->must (where "d" is the single argument passed to "dfamust");
3111 the length of the sequence is returned in d->mustn.
3113 The sequences calculated for the various types of node (in pseudo ANSI c)
3114 are shown below. "p" is the operand of unary operators (and the left-hand
3115 operand of binary operators); "q" is the right-hand operand of binary
3116 operators.
3118 "ZERO" means "a zero-length sequence" below.
3120 Type left right is in
3121 ---- ---- ----- -- --
3122 char c # c # c # c # c
3124 ANYCHAR ZERO ZERO ZERO ZERO
3126 MBCSET ZERO ZERO ZERO ZERO
3128 CSET ZERO ZERO ZERO ZERO
3130 STAR ZERO ZERO ZERO ZERO
3132 QMARK ZERO ZERO ZERO ZERO
3134 PLUS p->left p->right ZERO p->in
3136 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3137 p->left : q->right : q->is!=ZERO) ? q->in plus
3138 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3139 ZERO
3141 OR longest common longest common (do p->is and substrings common to
3142 leading trailing q->is have same p->in and q->in
3143 (sub)sequence (sub)sequence length and
3144 of p->left of p->right content) ?
3145 and q->left and q->right p->is : NULL
3147 If there's anything else we recognize in the tree, all four sequences get set
3148 to zero-length sequences. If there's something we don't recognize in the tree,
3149 we just return a zero-length sequence.
3151 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3152 'aaa')?
3154 And. . .is it here or someplace that we might ponder "optimizations" such as
3155 egrep 'psi|epsilon' -> egrep 'psi'
3156 egrep 'pepsi|epsilon' -> egrep 'epsi'
3157 (Yes, we now find "epsi" as a "string
3158 that must occur", but we might also
3159 simplify the *entire* r.e. being sought)
3160 grep '[c]' -> grep 'c'
3161 grep '(ab|a)b' -> grep 'ab'
3162 grep 'ab*' -> grep 'a'
3163 grep 'a*b' -> grep 'b'
3165 There are several issues:
3167 Is optimization easy (enough)?
3169 Does optimization actually accomplish anything,
3170 or is the automaton you get from "psi|epsilon" (for example)
3171 the same as the one you get from "psi" (for example)?
3173 Are optimizable r.e.'s likely to be used in real-life situations
3174 (something like 'ab*' is probably unlikely; something like is
3175 'psi|epsilon' is likelier)? */
3177 static char *
3178 icatalloc (char *old, char *new)
3180 char *result;
3181 size_t oldsize, newsize;
3183 newsize = (new == NULL) ? 0 : strlen(new);
3184 if (old == NULL)
3185 oldsize = 0;
3186 else if (newsize == 0)
3187 return old;
3188 else oldsize = strlen(old);
3189 if (old == NULL)
3190 result = (char *) malloc(newsize + 1);
3191 else
3192 result = (char *) realloc((void *) old, oldsize + newsize + 1);
3193 if (result != NULL && new != NULL)
3194 (void) strcpy(result + oldsize, new);
3195 return result;
3198 static char *
3199 icpyalloc (char *string)
3201 return icatalloc((char *) NULL, string);
3204 static char *
3205 istrstr (char *lookin, char *lookfor)
3207 char *cp;
3208 size_t len;
3210 len = strlen(lookfor);
3211 for (cp = lookin; *cp != '\0'; ++cp)
3212 if (strncmp(cp, lookfor, len) == 0)
3213 return cp;
3214 return NULL;
3217 static void
3218 freelist (char **cpp)
3220 int i;
3222 if (cpp == NULL)
3223 return;
3224 for (i = 0; cpp[i] != NULL; ++i)
3226 free(cpp[i]);
3227 cpp[i] = NULL;
3231 static char **
3232 enlist (char **cpp, char *new, size_t len)
3234 int i, j;
3236 if (cpp == NULL)
3237 return NULL;
3238 if ((new = icpyalloc(new)) == NULL)
3240 freelist(cpp);
3241 return NULL;
3243 new[len] = '\0';
3244 /* Is there already something in the list that's new (or longer)? */
3245 for (i = 0; cpp[i] != NULL; ++i)
3246 if (istrstr(cpp[i], new) != NULL)
3248 free(new);
3249 return cpp;
3251 /* Eliminate any obsoleted strings. */
3252 j = 0;
3253 while (cpp[j] != NULL)
3254 if (istrstr(new, cpp[j]) == NULL)
3255 ++j;
3256 else
3258 free(cpp[j]);
3259 if (--i == j)
3260 break;
3261 cpp[j] = cpp[i];
3262 cpp[i] = NULL;
3264 /* Add the new string. */
3265 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3266 if (cpp == NULL)
3267 return NULL;
3268 cpp[i] = new;
3269 cpp[i + 1] = NULL;
3270 return cpp;
3273 /* Given pointers to two strings, return a pointer to an allocated
3274 list of their distinct common substrings. Return NULL if something
3275 seems wild. */
3276 static char **
3277 comsubs (char *left, char *right)
3279 char **cpp;
3280 char *lcp;
3281 char *rcp;
3282 size_t i, len;
3284 if (left == NULL || right == NULL)
3285 return NULL;
3286 cpp = (char **) malloc(sizeof *cpp);
3287 if (cpp == NULL)
3288 return NULL;
3289 cpp[0] = NULL;
3290 for (lcp = left; *lcp != '\0'; ++lcp)
3292 len = 0;
3293 rcp = strchr (right, *lcp);
3294 while (rcp != NULL)
3296 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3297 continue;
3298 if (i > len)
3299 len = i;
3300 rcp = strchr (rcp + 1, *lcp);
3302 if (len == 0)
3303 continue;
3304 if ((cpp = enlist(cpp, lcp, len)) == NULL)
3305 break;
3307 return cpp;
3310 static char **
3311 addlists (char **old, char **new)
3313 int i;
3315 if (old == NULL || new == NULL)
3316 return NULL;
3317 for (i = 0; new[i] != NULL; ++i)
3319 old = enlist(old, new[i], strlen(new[i]));
3320 if (old == NULL)
3321 break;
3323 return old;
3326 /* Given two lists of substrings, return a new list giving substrings
3327 common to both. */
3328 static char **
3329 inboth (char **left, char **right)
3331 char **both;
3332 char **temp;
3333 int lnum, rnum;
3335 if (left == NULL || right == NULL)
3336 return NULL;
3337 both = (char **) malloc(sizeof *both);
3338 if (both == NULL)
3339 return NULL;
3340 both[0] = NULL;
3341 for (lnum = 0; left[lnum] != NULL; ++lnum)
3343 for (rnum = 0; right[rnum] != NULL; ++rnum)
3345 temp = comsubs(left[lnum], right[rnum]);
3346 if (temp == NULL)
3348 freelist(both);
3349 return NULL;
3351 both = addlists(both, temp);
3352 freelist(temp);
3353 free(temp);
3354 if (both == NULL)
3355 return NULL;
3358 return both;
3361 typedef struct
3363 char **in;
3364 char *left;
3365 char *right;
3366 char *is;
3367 } must;
3369 static void
3370 resetmust (must *mp)
3372 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3373 freelist(mp->in);
3376 static void
3377 dfamust (struct dfa *d)
3379 must *musts;
3380 must *mp;
3381 char *result;
3382 int ri;
3383 int i;
3384 int exact;
3385 token t;
3386 static must must0;
3387 struct dfamust *dm;
3388 static char empty_string[] = "";
3390 result = empty_string;
3391 exact = 0;
3392 musts = (must *) malloc((d->tindex + 1) * sizeof *musts);
3393 if (musts == NULL)
3394 return;
3395 mp = musts;
3396 for (i = 0; i <= d->tindex; ++i)
3397 mp[i] = must0;
3398 for (i = 0; i <= d->tindex; ++i)
3400 mp[i].in = (char **) malloc(sizeof *mp[i].in);
3401 mp[i].left = malloc(2);
3402 mp[i].right = malloc(2);
3403 mp[i].is = malloc(2);
3404 if (mp[i].in == NULL || mp[i].left == NULL ||
3405 mp[i].right == NULL || mp[i].is == NULL)
3406 goto done;
3407 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3408 mp[i].in[0] = NULL;
3410 #ifdef DEBUG
3411 fprintf(stderr, "dfamust:\n");
3412 for (i = 0; i < d->tindex; ++i)
3414 fprintf(stderr, " %d:", i);
3415 prtok(d->tokens[i]);
3417 putc('\n', stderr);
3418 #endif
3419 for (ri = 0; ri < d->tindex; ++ri)
3421 switch (t = d->tokens[ri])
3423 case LPAREN:
3424 case RPAREN:
3425 goto done; /* "cannot happen" */
3426 case EMPTY:
3427 case BEGLINE:
3428 case ENDLINE:
3429 case BEGWORD:
3430 case ENDWORD:
3431 case LIMWORD:
3432 case NOTLIMWORD:
3433 case BACKREF:
3434 resetmust(mp);
3435 break;
3436 case STAR:
3437 case QMARK:
3438 if (mp <= musts)
3439 goto done; /* "cannot happen" */
3440 --mp;
3441 resetmust(mp);
3442 break;
3443 case OR:
3444 case ORTOP:
3445 if (mp < &musts[2])
3446 goto done; /* "cannot happen" */
3448 char **new;
3449 must *lmp;
3450 must *rmp;
3451 int j, ln, rn, n;
3453 rmp = --mp;
3454 lmp = --mp;
3455 /* Guaranteed to be. Unlikely, but. . . */
3456 if (strcmp(lmp->is, rmp->is) != 0)
3457 lmp->is[0] = '\0';
3458 /* Left side--easy */
3459 i = 0;
3460 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3461 ++i;
3462 lmp->left[i] = '\0';
3463 /* Right side */
3464 ln = strlen(lmp->right);
3465 rn = strlen(rmp->right);
3466 n = ln;
3467 if (n > rn)
3468 n = rn;
3469 for (i = 0; i < n; ++i)
3470 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3471 break;
3472 for (j = 0; j < i; ++j)
3473 lmp->right[j] = lmp->right[(ln - i) + j];
3474 lmp->right[j] = '\0';
3475 new = inboth(lmp->in, rmp->in);
3476 if (new == NULL)
3477 goto done;
3478 freelist(lmp->in);
3479 free(lmp->in);
3480 lmp->in = new;
3482 break;
3483 case PLUS:
3484 if (mp <= musts)
3485 goto done; /* "cannot happen" */
3486 --mp;
3487 mp->is[0] = '\0';
3488 break;
3489 case END:
3490 if (mp != &musts[1])
3491 goto done; /* "cannot happen" */
3492 for (i = 0; musts[0].in[i] != NULL; ++i)
3493 if (strlen(musts[0].in[i]) > strlen(result))
3494 result = musts[0].in[i];
3495 if (strcmp(result, musts[0].is) == 0)
3496 exact = 1;
3497 goto done;
3498 case CAT:
3499 if (mp < &musts[2])
3500 goto done; /* "cannot happen" */
3502 must *lmp;
3503 must *rmp;
3505 rmp = --mp;
3506 lmp = --mp;
3507 /* In. Everything in left, plus everything in
3508 right, plus catenation of
3509 left's right and right's left. */
3510 lmp->in = addlists(lmp->in, rmp->in);
3511 if (lmp->in == NULL)
3512 goto done;
3513 if (lmp->right[0] != '\0' &&
3514 rmp->left[0] != '\0')
3516 char *tp;
3518 tp = icpyalloc(lmp->right);
3519 if (tp == NULL)
3520 goto done;
3521 tp = icatalloc(tp, rmp->left);
3522 if (tp == NULL)
3523 goto done;
3524 lmp->in = enlist(lmp->in, tp,
3525 strlen(tp));
3526 free(tp);
3527 if (lmp->in == NULL)
3528 goto done;
3530 /* Left-hand */
3531 if (lmp->is[0] != '\0')
3533 lmp->left = icatalloc(lmp->left,
3534 rmp->left);
3535 if (lmp->left == NULL)
3536 goto done;
3538 /* Right-hand */
3539 if (rmp->is[0] == '\0')
3540 lmp->right[0] = '\0';
3541 lmp->right = icatalloc(lmp->right, rmp->right);
3542 if (lmp->right == NULL)
3543 goto done;
3544 /* Guaranteed to be */
3545 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3547 lmp->is = icatalloc(lmp->is, rmp->is);
3548 if (lmp->is == NULL)
3549 goto done;
3551 else
3552 lmp->is[0] = '\0';
3554 break;
3555 default:
3556 if (t < END)
3558 /* "cannot happen" */
3559 goto done;
3561 else if (t == '\0')
3563 /* not on *my* shift */
3564 goto done;
3566 else if (t >= CSET
3567 #ifdef MBS_SUPPORT
3568 || t == ANYCHAR
3569 || t == MBCSET
3570 #endif /* MBS_SUPPORT */
3573 /* easy enough */
3574 resetmust(mp);
3576 else
3578 /* plain character */
3579 resetmust(mp);
3580 mp->is[0] = mp->left[0] = mp->right[0] = t;
3581 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3582 mp->in = enlist(mp->in, mp->is, (size_t)1);
3583 if (mp->in == NULL)
3584 goto done;
3586 break;
3588 #ifdef DEBUG
3589 fprintf(stderr, " node: %d:", ri);
3590 prtok(d->tokens[ri]);
3591 fprintf(stderr, "\n in:");
3592 for (i = 0; mp->in[i]; ++i)
3593 fprintf(stderr, " \"%s\"", mp->in[i]);
3594 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
3595 fprintf(stderr, " left: \"%s\"\n", mp->left);
3596 fprintf(stderr, " right: \"%s\"\n", mp->right);
3597 #endif
3598 ++mp;
3600 done:
3601 if (strlen(result))
3603 MALLOC(dm, struct dfamust, 1);
3604 dm->exact = exact;
3605 MALLOC(dm->must, char, strlen(result) + 1);
3606 strcpy(dm->must, result);
3607 dm->next = d->musts;
3608 d->musts = dm;
3610 mp = musts;
3611 for (i = 0; i <= d->tindex; ++i)
3613 freelist(mp[i].in);
3614 free(mp[i].in);
3615 free(mp[i].left);
3616 free(mp[i].right);
3617 free(mp[i].is);
3619 free(mp);
3621 /* vim:set shiftwidth=2: */