doc: Document version-etc, version-etc, and argp-version-etc.
[gnulib.git] / lib / dfa.c
blob684043dd8fe09c4c623122263d1728a7f6c70602
1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2025 Free Software
3 Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18 /* Written June, 1988 by Mike Haertel
19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
21 #include <config.h>
23 #include "dfa.h"
25 #include "flexmember.h"
26 #include "idx.h"
27 #include "verify.h"
29 #include <assert.h>
30 #include <ctype.h>
31 #include <stdint.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <limits.h>
35 #include <string.h>
36 #include <wchar.h>
38 #include "xalloc.h"
39 #include "localeinfo.h"
41 #include "gettext.h"
42 #define _(msgid) dgettext ("gnulib", msgid)
44 #if GAWK
45 /* Use ISO C 99 API. */
46 # include <wctype.h>
47 # define char32_t wchar_t
48 # define mbrtoc32 mbrtowc
49 # define c32rtomb wcrtomb
50 # define c32tob wctob
51 # define c32isprint iswprint
52 # define c32isspace iswspace
53 # define mbszero(p) memset ((p), 0, sizeof (mbstate_t))
54 #else
55 /* Use ISO C 11 + gnulib API. */
56 # include <uchar.h>
57 #endif
59 /* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
60 understandably cannot deduce that the input comes from a
61 well-formed regular expression. There's little point to the
62 runtime overhead of 'assert' instead of 'assume_nonnull' when the
63 MMU will check anyway. */
64 #define assume_nonnull(x) assume ((x) != NULL)
66 static bool
67 str_eq (char const *a, char const *b)
69 return strcmp (a, b) == 0;
72 static bool
73 c_isdigit (char c)
75 return '0' <= c && c <= '9';
78 #ifndef FALLTHROUGH
79 # if 201710L < __STDC_VERSION__
80 # define FALLTHROUGH [[__fallthrough__]]
81 # elif ((__GNUC__ >= 7 && !defined __clang__) \
82 || (defined __apple_build_version__ \
83 ? __apple_build_version__ >= 12000000 \
84 : __clang_major__ >= 10))
85 # define FALLTHROUGH __attribute__ ((__fallthrough__))
86 # else
87 # define FALLTHROUGH ((void) 0)
88 # endif
89 #endif
91 #ifndef MIN
92 # define MIN(a,b) ((a) < (b) ? (a) : (b))
93 #endif
95 /* HPUX defines these as macros in sys/param.h. */
96 #ifdef setbit
97 # undef setbit
98 #endif
99 #ifdef clrbit
100 # undef clrbit
101 #endif
103 /* For code that does not use Gnulib’s isblank module. */
104 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
105 # define isblank dfa_isblank
106 static int
107 isblank (int c)
109 return c == ' ' || c == '\t';
111 #endif
113 /* First integer value that is greater than any character code. */
114 enum { NOTCHAR = 1 << CHAR_BIT };
116 #ifdef UINT_LEAST64_MAX
118 /* Number of bits used in a charclass word. */
119 enum { CHARCLASS_WORD_BITS = 64 };
121 /* This represents part of a character class. It must be unsigned and
122 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
123 typedef uint_least64_t charclass_word;
125 /* Part of a charclass initializer that represents 64 bits' worth of a
126 charclass, where LO and HI are the low and high-order 32 bits of
127 the 64-bit quantity. */
128 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
130 #else
131 /* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
132 enum { CHARCLASS_WORD_BITS = 32 };
133 typedef unsigned long charclass_word;
134 # define CHARCLASS_PAIR(lo, hi) lo, hi
135 #endif
137 /* An initializer for a charclass whose 32-bit words are A through H. */
138 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
139 {{ \
140 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
141 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
144 /* The maximum useful value of a charclass_word; all used bits are 1. */
145 static charclass_word const CHARCLASS_WORD_MASK
146 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
148 /* Number of words required to hold a bit for every character. */
149 enum
151 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
154 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
155 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
157 /* Convert a possibly-signed character to an unsigned character. This is
158 a bit safer than casting to unsigned char, since it catches some type
159 errors that the cast doesn't. */
160 static unsigned char
161 to_uchar (char ch)
163 return ch;
166 /* Contexts tell us whether a character is a newline or a word constituent.
167 Word-constituent characters are those that satisfy iswalnum, plus '_'.
168 Each character has a single CTX_* value; bitmasks of CTX_* values denote
169 a particular character class.
171 A state also stores a context value, which is a bitmask of CTX_* values.
172 A state's context represents a set of characters that the state's
173 predecessors must match. For example, a state whose context does not
174 include CTX_LETTER will never have transitions where the previous
175 character is a word constituent. A state whose context is CTX_ANY
176 might have transitions from any character. */
178 enum
180 CTX_NONE = 1,
181 CTX_LETTER = 2,
182 CTX_NEWLINE = 4,
183 CTX_ANY = 7
186 /* Sometimes characters can only be matched depending on the surrounding
187 context. Such context decisions depend on what the previous character
188 was, and the value of the current (lookahead) character. Context
189 dependent constraints are encoded as 9-bit integers. Each bit that
190 is set indicates that the constraint succeeds in the corresponding
191 context.
193 bit 6-8 - valid contexts when next character is CTX_NEWLINE
194 bit 3-5 - valid contexts when next character is CTX_LETTER
195 bit 0-2 - valid contexts when next character is CTX_NONE
197 succeeds_in_context determines whether a given constraint
198 succeeds in a particular context. Prev is a bitmask of possible
199 context values for the previous character, curr is the (single-bit)
200 context value for the lookahead character. */
201 static int
202 newline_constraint (int constraint)
204 return (constraint >> 6) & 7;
206 static int
207 letter_constraint (int constraint)
209 return (constraint >> 3) & 7;
211 static int
212 other_constraint (int constraint)
214 return constraint & 7;
217 static bool
218 succeeds_in_context (int constraint, int prev, int curr)
220 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
221 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
222 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
223 & prev);
226 /* The following describe what a constraint depends on. */
227 static bool
228 prev_newline_dependent (int constraint)
230 return ((constraint ^ constraint >> 2) & 0111) != 0;
232 static bool
233 prev_letter_dependent (int constraint)
235 return ((constraint ^ constraint >> 1) & 0111) != 0;
238 /* Tokens that match the empty string subject to some constraint actually
239 work by applying that constraint to determine what may follow them,
240 taking into account what has gone before. The following values are
241 the constraints corresponding to the special tokens previously defined. */
242 enum
244 NO_CONSTRAINT = 0777,
245 BEGLINE_CONSTRAINT = 0444,
246 ENDLINE_CONSTRAINT = 0700,
247 BEGWORD_CONSTRAINT = 0050,
248 ENDWORD_CONSTRAINT = 0202,
249 LIMWORD_CONSTRAINT = 0252,
250 NOTLIMWORD_CONSTRAINT = 0525
253 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
254 are operators and others are terminal symbols. Most (but not all) of these
255 codes are returned by the lexical analyzer. */
257 typedef ptrdiff_t token;
258 static token const TOKEN_MAX = PTRDIFF_MAX;
260 /* States are indexed by state_num values. These are normally
261 nonnegative but -1 is used as a special value. */
262 typedef ptrdiff_t state_num;
264 /* Predefined token values. */
265 enum
267 END = -1, /* END is a terminal symbol that matches the
268 end of input; any value of END or less in
269 the parse tree is such a symbol. Accepting
270 states of the DFA are those that would have
271 a transition on END. This is -1, not some
272 more-negative value, to tweak the speed of
273 comparisons to END. */
275 /* Ordinary character values are terminal symbols that match themselves. */
277 /* CSET must come last in the following list of special tokens. Otherwise,
278 the list order matters only for performance. Related special tokens
279 should have nearby values so that code like (t == ANYCHAR || t == MBCSET
280 || CSET <= t) can be done with a single machine-level comparison. */
282 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
283 the empty string. */
285 QMARK, /* QMARK is an operator of one argument that
286 matches zero or one occurrences of its
287 argument. */
289 STAR, /* STAR is an operator of one argument that
290 matches the Kleene closure (zero or more
291 occurrences) of its argument. */
293 PLUS, /* PLUS is an operator of one argument that
294 matches the positive closure (one or more
295 occurrences) of its argument. */
297 REPMN, /* REPMN is a lexical token corresponding
298 to the {m,n} construct. REPMN never
299 appears in the compiled token vector. */
301 CAT, /* CAT is an operator of two arguments that
302 matches the concatenation of its
303 arguments. CAT is never returned by the
304 lexical analyzer. */
306 OR, /* OR is an operator of two arguments that
307 matches either of its arguments. */
309 LPAREN, /* LPAREN never appears in the parse tree,
310 it is only a lexeme. */
312 RPAREN, /* RPAREN never appears in the parse tree. */
314 WCHAR, /* Only returned by lex. wctok contains the
315 32-bit wide character representation. */
317 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
318 a valid multibyte (or single byte) character.
319 It is used only if MB_CUR_MAX > 1. */
321 BEG, /* BEG is an initial symbol that matches the
322 beginning of input. */
324 BEGLINE, /* BEGLINE is a terminal symbol that matches
325 the empty string at the beginning of a
326 line. */
328 ENDLINE, /* ENDLINE is a terminal symbol that matches
329 the empty string at the end of a line. */
331 BEGWORD, /* BEGWORD is a terminal symbol that matches
332 the empty string at the beginning of a
333 word. */
335 ENDWORD, /* ENDWORD is a terminal symbol that matches
336 the empty string at the end of a word. */
338 LIMWORD, /* LIMWORD is a terminal symbol that matches
339 the empty string at the beginning or the
340 end of a word. */
342 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
343 matches the empty string not at
344 the beginning or end of a word. */
346 BACKREF, /* BACKREF is generated by \<digit>
347 or by any other construct that
348 is not completely handled. If the scanner
349 detects a transition on backref, it returns
350 a kind of "semi-success" indicating that
351 the match will have to be verified with
352 a backtracking matcher. */
354 MBCSET, /* MBCSET is similar to CSET, but for
355 multibyte characters. */
357 CSET /* CSET and (and any value greater) is a
358 terminal symbol that matches any of a
359 class of characters. */
363 /* States of the recognizer correspond to sets of positions in the parse
364 tree, together with the constraints under which they may be matched.
365 So a position is encoded as an index into the parse tree together with
366 a constraint. */
367 typedef struct
369 idx_t index; /* Index into the parse array. */
370 unsigned int constraint; /* Constraint for matching this position. */
371 } position;
373 /* Sets of positions are stored as arrays. */
374 typedef struct
376 position *elems; /* Elements of this position set. */
377 idx_t nelem; /* Number of elements in this set. */
378 idx_t alloc; /* Number of elements allocated in ELEMS. */
379 } position_set;
381 /* A state of the dfa consists of a set of positions, some flags,
382 and the token value of the lowest-numbered position of the state that
383 contains an END token. */
384 typedef struct
386 size_t hash; /* Hash of the positions of this state. */
387 position_set elems; /* Positions this state could match. */
388 unsigned char context; /* Context from previous state. */
389 unsigned short constraint; /* Constraint for this state to accept. */
390 position_set mbps; /* Positions which can match multibyte
391 characters or the follows, e.g., period.
392 Used only if MB_CUR_MAX > 1. */
393 state_num mb_trindex; /* Index of this state in MB_TRANS, or
394 negative if the state does not have
395 ANYCHAR. */
396 } dfa_state;
398 /* Maximum for any transition table count. This should be at least 3,
399 for the initial state setup. */
400 enum { MAX_TRCOUNT = 1024 };
402 /* A bracket operator.
403 e.g., [a-c], [[:alpha:]], etc. */
404 struct mb_char_classes
406 ptrdiff_t cset;
407 bool invert;
408 char32_t *chars; /* Normal characters. */
409 idx_t nchars;
410 idx_t nchars_alloc;
413 struct regex_syntax
415 /* Syntax bits controlling the behavior of the lexical analyzer. */
416 reg_syntax_t syntax_bits;
417 int dfaopts;
418 bool syntax_bits_set;
420 /* Flag for case-folding letters into sets. */
421 bool case_fold;
423 /* End-of-line byte in data. */
424 unsigned char eolbyte;
426 /* Cache of char-context values. */
427 char sbit[NOTCHAR];
429 /* If never_trail[B], the byte B cannot be a non-initial byte in a
430 multibyte character. */
431 bool never_trail[NOTCHAR];
433 /* Set of characters considered letters. */
434 charclass letters;
436 /* Set of characters that are newline. */
437 charclass newline;
440 /* Lexical analyzer. All the dross that deals with the obnoxious
441 GNU Regex syntax bits is located here. The poor, suffering
442 reader is referred to the GNU Regex documentation for the
443 meaning of the @#%!@#%^!@ syntax bits. */
444 struct lexer_state
446 char const *ptr; /* Pointer to next input character. */
447 idx_t left; /* Number of characters remaining. */
448 token lasttok; /* Previous token returned; initially END. */
449 idx_t parens; /* Count of outstanding left parens. */
450 int minrep, maxrep; /* Repeat counts for {m,n}. */
452 /* 32-bit wide character representation of the current multibyte character,
453 or WEOF if there was an encoding error. Used only if
454 MB_CUR_MAX > 1. */
455 wint_t wctok;
457 /* The most recently analyzed multibyte bracket expression. */
458 struct mb_char_classes brack;
460 /* We're separated from beginning or (, | only by zero-width characters. */
461 bool laststart;
464 /* Recursive descent parser for regular expressions. */
466 struct parser_state
468 token tok; /* Lookahead token. */
469 idx_t depth; /* Current depth of a hypothetical stack
470 holding deferred productions. This is
471 used to determine the depth that will be
472 required of the real stack later on in
473 dfaanalyze. */
476 /* A compiled regular expression. */
477 struct dfa
479 /* Fields filled by the scanner. */
480 charclass *charclasses; /* Array of character sets for CSET tokens. */
481 idx_t cindex; /* Index for adding new charclasses. */
482 idx_t calloc; /* Number of charclasses allocated. */
483 ptrdiff_t canychar; /* Index of anychar class, or -1. */
485 /* Scanner state */
486 struct lexer_state lex;
488 /* Parser state */
489 struct parser_state parse;
491 /* Fields filled by the parser. */
492 token *tokens; /* Postfix parse array. */
493 idx_t tindex; /* Index for adding new tokens. */
494 idx_t talloc; /* Number of tokens currently allocated. */
495 idx_t depth; /* Depth required of an evaluation stack
496 used for depth-first traversal of the
497 parse tree. */
498 idx_t nleaves; /* Number of non-EMPTY leaves
499 in the parse tree. */
500 idx_t nregexps; /* Count of parallel regexps being built
501 with dfaparse. */
502 bool fast; /* The DFA is fast. */
503 bool epsilon; /* Does a token match only the empty string? */
504 token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
505 mbstate_t mbs; /* Multibyte conversion state. */
507 /* The following are valid only if MB_CUR_MAX > 1. */
509 /* The value of multibyte_prop[i] is defined by following rule.
510 if tokens[i] < NOTCHAR
511 bit 0 : tokens[i] is the first byte of a character, including
512 single-byte characters.
513 bit 1 : tokens[i] is the last byte of a character, including
514 single-byte characters.
516 e.g.
517 tokens
518 = 'single_byte_a', 'multi_byte_A', single_byte_b'
519 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
520 multibyte_prop
521 = 3 , 1 , 0 , 2 , 3
523 char *multibyte_prop;
525 /* Fields filled by the superset. */
526 struct dfa *superset; /* Hint of the dfa. */
528 /* Fields filled by the state builder. */
529 dfa_state *states; /* States of the dfa. */
530 state_num sindex; /* Index for adding new states. */
531 idx_t salloc; /* Number of states currently allocated. */
533 /* Fields filled by the parse tree->NFA conversion. */
534 position_set *follows; /* Array of follow sets, indexed by position
535 index. The follow of a position is the set
536 of positions containing characters that
537 could conceivably follow a character
538 matching the given position in a string
539 matching the regexp. Allocated to the
540 maximum possible position index. */
541 bool searchflag; /* We are supposed to build a searching
542 as opposed to an exact matcher. A searching
543 matcher finds the first and shortest string
544 matching a regexp anywhere in the buffer,
545 whereas an exact matcher finds the longest
546 string matching, but anchored to the
547 beginning of the buffer. */
549 /* Fields filled by dfaanalyze. */
550 int *constraints; /* Array of union of accepting constraints
551 in the follow of a position. */
552 int *separates; /* Array of contexts on follow of a
553 position. */
555 /* Fields filled by dfaexec. */
556 state_num tralloc; /* Number of transition tables that have
557 slots so far, not counting trans[-1] and
558 trans[-2]. */
559 int trcount; /* Number of transition tables that have
560 been built, other than for initial
561 states. */
562 int min_trcount; /* Number of initial states. Equivalently,
563 the minimum state number for which trcount
564 counts transitions. */
565 state_num **trans; /* Transition tables for states that can
566 never accept. If the transitions for a
567 state have not yet been computed, or the
568 state could possibly accept, its entry in
569 this table is NULL. This points to two
570 past the start of the allocated array,
571 and trans[-1] and trans[-2] are always
572 NULL. */
573 state_num **fails; /* Transition tables after failing to accept
574 on a state that potentially could do so.
575 If trans[i] is non-null, fails[i] must
576 be null. */
577 char *success; /* Table of acceptance conditions used in
578 dfaexec and computed in build_state. */
579 state_num *newlines; /* Transitions on newlines. The entry for a
580 newline in any transition table is always
581 -1 so we can count lines without wasting
582 too many cycles. The transition for a
583 newline is stored separately and handled
584 as a special case. Newline is also used
585 as a sentinel at the end of the buffer. */
586 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
587 context in multibyte locales, in which we
588 do not distinguish between their contexts,
589 as not supported word. */
590 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
591 state_num **mb_trans; /* Transition tables for states with
592 ANYCHAR. */
593 state_num mb_trcount; /* Number of transition tables for states with
594 ANYCHAR that have actually been built. */
596 /* Syntax configuration. This is near the end so that dfacopysyntax
597 can memset up to here. */
598 struct regex_syntax syntax;
600 /* Information derived from the locale. This is at the end so that
601 a quick memset need not clear it specially. */
603 /* dfaexec implementation. */
604 char *(*dfaexec) (struct dfa *, char const *, char *,
605 bool, idx_t *, bool *);
607 /* Other cached information derived from the locale. */
608 struct localeinfo localeinfo;
611 /* User access to dfa internals. */
613 /* S could possibly be an accepting state of R. */
614 static bool
615 accepting (state_num s, struct dfa const *r)
617 return r->states[s].constraint != 0;
620 /* STATE accepts in the specified context. */
621 static bool
622 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
624 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
627 static void regexp (struct dfa *dfa);
629 /* Store into *PWC the result of converting the leading bytes of the
630 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
631 and updating the conversion state in *D. On conversion error,
632 convert just a single byte, to WEOF. Return the number of bytes
633 converted.
635 This differs from mbrtoc32 (PWC, S, N, &D->mbs) as follows:
637 * PWC points to wint_t, not to char32_t.
638 * The last arg is a dfa *D instead of merely a multibyte conversion
639 state D->mbs.
640 * N is idx_t not size_t, and must be at least 1.
641 * S[N - 1] must be a sentinel byte.
642 * Shift encodings are not supported.
643 * The return value is always in the range 1..N.
644 * D->mbs is always valid afterwards.
645 * *PWC is always set to something. */
646 static int
647 mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
649 unsigned char uc = s[0];
650 wint_t wc = d->localeinfo.sbctowc[uc];
652 if (wc == WEOF)
654 char32_t wch;
655 size_t nbytes = mbrtoc32 (&wch, s, n, &d->mbs);
656 if (0 < nbytes && nbytes < (size_t) -2)
658 *pwc = wch;
659 /* nbytes cannot be == (size) -3 here, since we rely on the
660 'mbrtoc32-regular' module. */
661 return nbytes;
663 mbszero (&d->mbs);
666 *pwc = wc;
667 return 1;
670 #ifdef DEBUG
672 static void
673 prtok (token t)
675 if (t <= END)
676 fprintf (stderr, "END");
677 else if (0 <= t && t < NOTCHAR)
679 unsigned int ch = t;
680 fprintf (stderr, "0x%02x", ch);
682 else
684 char const *s;
685 switch (t)
687 case BEG:
688 s = "BEG";
689 break;
690 case EMPTY:
691 s = "EMPTY";
692 break;
693 case BACKREF:
694 s = "BACKREF";
695 break;
696 case BEGLINE:
697 s = "BEGLINE";
698 break;
699 case ENDLINE:
700 s = "ENDLINE";
701 break;
702 case BEGWORD:
703 s = "BEGWORD";
704 break;
705 case ENDWORD:
706 s = "ENDWORD";
707 break;
708 case LIMWORD:
709 s = "LIMWORD";
710 break;
711 case NOTLIMWORD:
712 s = "NOTLIMWORD";
713 break;
714 case QMARK:
715 s = "QMARK";
716 break;
717 case STAR:
718 s = "STAR";
719 break;
720 case PLUS:
721 s = "PLUS";
722 break;
723 case CAT:
724 s = "CAT";
725 break;
726 case OR:
727 s = "OR";
728 break;
729 case LPAREN:
730 s = "LPAREN";
731 break;
732 case RPAREN:
733 s = "RPAREN";
734 break;
735 case ANYCHAR:
736 s = "ANYCHAR";
737 break;
738 case MBCSET:
739 s = "MBCSET";
740 break;
741 default:
742 s = "CSET";
743 break;
745 fprintf (stderr, "%s", s);
748 #endif /* DEBUG */
750 /* Stuff pertaining to charclasses. */
752 static bool
753 tstbit (unsigned int b, charclass const *c)
755 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
758 static void
759 setbit (unsigned int b, charclass *c)
761 charclass_word one = 1;
762 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
765 static void
766 clrbit (unsigned int b, charclass *c)
768 charclass_word one = 1;
769 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
772 static void
773 zeroset (charclass *s)
775 memset (s, 0, sizeof *s);
778 static void
779 fillset (charclass *s)
781 for (int i = 0; i < CHARCLASS_WORDS; i++)
782 s->w[i] = CHARCLASS_WORD_MASK;
785 static void
786 notset (charclass *s)
788 for (int i = 0; i < CHARCLASS_WORDS; ++i)
789 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
792 static bool
793 equal (charclass const *s1, charclass const *s2)
795 charclass_word w = 0;
796 for (int i = 0; i < CHARCLASS_WORDS; i++)
797 w |= s1->w[i] ^ s2->w[i];
798 return w == 0;
801 static bool
802 emptyset (charclass const *s)
804 charclass_word w = 0;
805 for (int i = 0; i < CHARCLASS_WORDS; i++)
806 w |= s->w[i];
807 return w == 0;
810 /* Ensure that the array addressed by PA holds at least I + 1 items.
811 Either return PA, or reallocate the array and return its new address.
812 Although PA may be null, the returned value is never null.
814 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
815 is updated on reallocation. If PA is null, *NITEMS must be zero.
816 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
817 ITEM_SIZE is the size of one item; it must be positive.
818 Avoid O(N**2) behavior on arrays growing linearly. */
819 static void *
820 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
821 ptrdiff_t nitems_max, idx_t item_size)
823 if (i < *nitems)
824 return pa;
825 return xpalloc (pa, nitems, 1, nitems_max, item_size);
828 /* In DFA D, find the index of charclass S, or allocate a new one. */
829 static idx_t
830 charclass_index (struct dfa *d, charclass const *s)
832 idx_t i;
834 for (i = 0; i < d->cindex; ++i)
835 if (equal (s, &d->charclasses[i]))
836 return i;
837 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
838 TOKEN_MAX - CSET, sizeof *d->charclasses);
839 ++d->cindex;
840 d->charclasses[i] = *s;
841 return i;
844 static bool
845 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
847 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
850 static int
851 char_context (struct dfa const *dfa, unsigned char c)
853 if (c == dfa->syntax.eolbyte && !(dfa->syntax.dfaopts & DFA_ANCHOR))
854 return CTX_NEWLINE;
855 if (unibyte_word_constituent (dfa, c))
856 return CTX_LETTER;
857 return CTX_NONE;
860 /* Set a bit in the charclass for the given char32_t. Do nothing if WC
861 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
862 this may happen when folding case in weird Turkish locales where
863 dotless i/dotted I are not included in the chosen character set.
864 Return whether a bit was set in the charclass. */
865 static bool
866 setbit_wc (char32_t wc, charclass *c)
868 int b = c32tob (wc);
869 if (b < 0)
870 return false;
872 setbit (b, c);
873 return true;
876 /* Set a bit for B and its case variants in the charclass C.
877 MB_CUR_MAX must be 1. */
878 static void
879 setbit_case_fold_c (int b, charclass *c)
881 int ub = toupper (b);
882 for (int i = 0; i < NOTCHAR; i++)
883 if (toupper (i) == ub)
884 setbit (i, c);
887 /* Fetch the next lexical input character from the pattern. There
888 must at least one byte of pattern input. Set DFA->lex.wctok to the
889 value of the character or to WEOF depending on whether the input is
890 a valid multibyte character (possibly of length 1). Then return
891 the next input byte value, except return EOF if the input is a
892 multibyte character of length greater than 1. */
893 static int
894 fetch_wc (struct dfa *dfa)
896 int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
897 dfa);
898 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
899 dfa->lex.ptr += nbytes;
900 dfa->lex.left -= nbytes;
901 return c;
904 /* If there is no more input, report an error about unbalanced brackets.
905 Otherwise, behave as with fetch_wc (DFA). */
906 static int
907 bracket_fetch_wc (struct dfa *dfa)
909 if (! dfa->lex.left)
910 dfaerror (_("unbalanced ["));
911 return fetch_wc (dfa);
914 typedef int predicate (int);
916 /* The following list maps the names of the Posix named character classes
917 to predicate functions that determine whether a given character is in
918 the class. The leading [ has already been eaten by the lexical
919 analyzer. */
920 struct dfa_ctype
922 const char *name;
923 predicate *func;
924 bool single_byte_only;
927 static const struct dfa_ctype prednames[] = {
928 {"alpha", isalpha, false},
929 {"upper", isupper, false},
930 {"lower", islower, false},
931 {"digit", isdigit, true},
932 {"xdigit", isxdigit, false},
933 {"space", isspace, false},
934 {"punct", ispunct, false},
935 {"alnum", isalnum, false},
936 {"print", isprint, false},
937 {"graph", isgraph, false},
938 {"cntrl", iscntrl, false},
939 {"blank", isblank, false},
940 {NULL, NULL, false}
943 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
944 find_pred (const char *str)
946 for (int i = 0; prednames[i].name; i++)
947 if (str_eq (str, prednames[i].name))
948 return &prednames[i];
949 return NULL;
952 /* Parse a bracket expression, which possibly includes multibyte
953 characters. */
954 static token
955 parse_bracket_exp (struct dfa *dfa)
957 /* This is a bracket expression that dfaexec is known to
958 process correctly. */
959 bool known_bracket_exp = true;
961 /* Used to warn about [:space:].
962 Bit 0 = first character is a colon.
963 Bit 1 = last character is a colon.
964 Bit 2 = includes any other character but a colon.
965 Bit 3 = includes ranges, char/equiv classes or collation elements. */
966 int colon_warning_state;
968 dfa->lex.brack.nchars = 0;
969 charclass ccl;
970 zeroset (&ccl);
971 int c = bracket_fetch_wc (dfa);
972 bool invert = c == '^';
973 if (invert)
975 c = bracket_fetch_wc (dfa);
976 known_bracket_exp = dfa->localeinfo.simple;
978 wint_t wc = dfa->lex.wctok;
979 int c1;
980 wint_t wc1;
981 colon_warning_state = (c == ':');
984 c1 = NOTCHAR; /* Mark c1 as not initialized. */
985 colon_warning_state &= ~2;
987 /* Note that if we're looking at some other [:...:] construct,
988 we just treat it as a bunch of ordinary characters. We can do
989 this because we assume regex has checked for syntax errors before
990 dfa is ever called. */
991 if (c == '[')
993 c1 = bracket_fetch_wc (dfa);
994 wc1 = dfa->lex.wctok;
996 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
997 || c1 == '.' || c1 == '=')
999 enum { MAX_BRACKET_STRING_LEN = 32 };
1000 char str[MAX_BRACKET_STRING_LEN + 1];
1001 int len = 0;
1002 for (;;)
1004 c = bracket_fetch_wc (dfa);
1005 if (dfa->lex.left == 0
1006 || (c == c1 && dfa->lex.ptr[0] == ']'))
1007 break;
1008 if (len < MAX_BRACKET_STRING_LEN)
1009 str[len++] = c;
1010 else
1011 /* This is in any case an invalid class name. */
1012 str[0] = '\0';
1014 str[len] = '\0';
1016 /* Fetch bracket. */
1017 c = bracket_fetch_wc (dfa);
1018 wc = dfa->lex.wctok;
1019 if (c1 == ':')
1020 /* Build character class. POSIX allows character
1021 classes to match multicharacter collating elements,
1022 but the regex code does not support that, so do not
1023 worry about that possibility. */
1025 char const *class
1026 = (dfa->syntax.case_fold && (str_eq (str, "upper")
1027 || str_eq (str, "lower"))
1028 ? "alpha" : str);
1029 const struct dfa_ctype *pred = find_pred (class);
1030 if (!pred)
1031 dfaerror (_("invalid character class"));
1033 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1034 known_bracket_exp = false;
1035 else
1036 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1037 if (pred->func (c2))
1038 setbit (c2, &ccl);
1040 else
1041 known_bracket_exp = false;
1043 colon_warning_state |= 8;
1045 /* Fetch new lookahead character. */
1046 c1 = bracket_fetch_wc (dfa);
1047 wc1 = dfa->lex.wctok;
1048 continue;
1051 /* We treat '[' as a normal character here. c/c1/wc/wc1
1052 are already set up. */
1055 if (c == '\\'
1056 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1058 c = bracket_fetch_wc (dfa);
1059 wc = dfa->lex.wctok;
1062 if (c1 == NOTCHAR)
1064 c1 = bracket_fetch_wc (dfa);
1065 wc1 = dfa->lex.wctok;
1068 if (c1 == '-')
1069 /* build range characters. */
1071 int c2 = bracket_fetch_wc (dfa);
1072 wint_t wc2 = dfa->lex.wctok;
1074 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1075 Treat it like [-a[.aa.]] while parsing it, and
1076 remember that the set is unknown. */
1077 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1079 known_bracket_exp = false;
1080 c2 = ']';
1083 if (c2 == ']')
1085 /* In the case [x-], the - is an ordinary hyphen,
1086 which is left in c1, the lookahead character. */
1087 dfa->lex.ptr--;
1088 dfa->lex.left++;
1090 else
1092 if (c2 == '\\' && (dfa->syntax.syntax_bits
1093 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1095 c2 = bracket_fetch_wc (dfa);
1096 wc2 = dfa->lex.wctok;
1099 colon_warning_state |= 8;
1100 c1 = bracket_fetch_wc (dfa);
1101 wc1 = dfa->lex.wctok;
1103 /* Treat [x-y] as a range if x != y. */
1104 if (wc != wc2 || wc == WEOF)
1106 if (dfa->localeinfo.simple
1107 || (c_isdigit (c) & c_isdigit (c2)))
1109 for (int ci = c; ci <= c2; ci++)
1110 if (dfa->syntax.case_fold && isalpha (ci))
1111 setbit_case_fold_c (ci, &ccl);
1112 else
1113 setbit (ci, &ccl);
1115 else
1116 known_bracket_exp = false;
1118 continue;
1123 colon_warning_state |= (c == ':') ? 2 : 4;
1125 if (!dfa->localeinfo.multibyte)
1127 if (dfa->syntax.case_fold && isalpha (c))
1128 setbit_case_fold_c (c, &ccl);
1129 else
1130 setbit (c, &ccl);
1131 continue;
1134 if (wc == WEOF)
1135 known_bracket_exp = false;
1136 else
1138 char32_t folded[CASE_FOLDED_BUFSIZE + 1];
1139 int n = (dfa->syntax.case_fold
1140 ? case_folded_counterparts (wc, folded + 1) + 1
1141 : 1);
1142 folded[0] = wc;
1143 for (int i = 0; i < n; i++)
1144 if (!setbit_wc (folded[i], &ccl))
1146 dfa->lex.brack.chars
1147 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1148 &dfa->lex.brack.nchars_alloc, -1,
1149 sizeof *dfa->lex.brack.chars);
1150 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1154 while ((wc = wc1, (c = c1) != ']'));
1156 if (colon_warning_state == 7)
1158 char const *msg
1159 = _("character class syntax is [[:space:]], not [:space:]");
1160 if (dfa->syntax.dfaopts & DFA_CONFUSING_BRACKETS_ERROR)
1161 dfaerror (msg);
1162 else
1163 dfawarn (msg);
1166 if (! known_bracket_exp)
1167 return BACKREF;
1169 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1171 dfa->lex.brack.invert = invert;
1172 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1173 return MBCSET;
1176 if (invert)
1178 notset (&ccl);
1179 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1180 clrbit ('\n', &ccl);
1183 return CSET + charclass_index (dfa, &ccl);
1186 struct lexptr
1188 char const *ptr;
1189 idx_t left;
1192 static void
1193 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1195 ls->ptr = dfa->lex.ptr;
1196 ls->left = dfa->lex.left;
1197 dfa->lex.ptr = s;
1198 dfa->lex.left = strlen (s);
1201 static void
1202 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1204 dfa->lex.ptr = ls->ptr;
1205 dfa->lex.left = ls->left;
1208 static token
1209 lex (struct dfa *dfa)
1211 bool backslash = false;
1213 /* Basic plan: We fetch a character. If it's a backslash,
1214 we set the backslash flag and go through the loop again.
1215 On the plus side, this avoids having a duplicate of the
1216 main switch inside the backslash case. On the minus side,
1217 it means that just about every case tests the backslash flag. */
1218 for (int i = 0; ; i++)
1220 /* This loop should consume at most a backslash and some other
1221 character. */
1222 assume (i < 2);
1224 if (! dfa->lex.left)
1225 return dfa->lex.lasttok = END;
1226 int c = fetch_wc (dfa);
1228 switch (c)
1230 case '\\':
1231 if (backslash)
1232 goto normal_char;
1233 if (dfa->lex.left == 0)
1234 dfaerror (_("unfinished \\ escape"));
1235 backslash = true;
1236 break;
1238 case '^':
1239 if (backslash)
1240 goto normal_char;
1241 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1242 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1243 || dfa->lex.lasttok == OR)
1244 return dfa->lex.lasttok = BEGLINE;
1245 goto normal_char;
1247 case '$':
1248 if (backslash)
1249 goto normal_char;
1250 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1251 || dfa->lex.left == 0
1252 || ((dfa->lex.left
1253 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1254 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1255 & (dfa->lex.ptr[0] == '\\')]
1256 == ')'))
1257 || ((dfa->lex.left
1258 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1259 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1260 & (dfa->lex.ptr[0] == '\\')]
1261 == '|'))
1262 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1263 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1264 return dfa->lex.lasttok = ENDLINE;
1265 goto normal_char;
1267 case '1':
1268 case '2':
1269 case '3':
1270 case '4':
1271 case '5':
1272 case '6':
1273 case '7':
1274 case '8':
1275 case '9':
1276 if (!backslash)
1277 goto normal_char;
1278 if (dfa->syntax.syntax_bits & RE_NO_BK_REFS)
1279 goto stray_backslash;
1281 dfa->lex.laststart = false;
1282 return dfa->lex.lasttok = BACKREF;
1284 case '`':
1285 if (!backslash)
1286 goto normal_char;
1287 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1288 goto stray_backslash;
1290 /* FIXME: should be beginning of string */
1291 return dfa->lex.lasttok = BEGLINE;
1293 case '\'':
1294 if (!backslash)
1295 goto normal_char;
1296 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1297 goto stray_backslash;
1299 /* FIXME: should be end of string */
1300 return dfa->lex.lasttok = ENDLINE;
1302 case '<':
1303 if (!backslash)
1304 goto normal_char;
1305 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1306 goto stray_backslash;
1308 return dfa->lex.lasttok = BEGWORD;
1310 case '>':
1311 if (!backslash)
1312 goto normal_char;
1313 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1314 goto stray_backslash;
1316 return dfa->lex.lasttok = ENDWORD;
1318 case 'b':
1319 if (!backslash)
1320 goto normal_char;
1321 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1322 goto stray_backslash;
1324 return dfa->lex.lasttok = LIMWORD;
1326 case 'B':
1327 if (!backslash)
1328 goto normal_char;
1329 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1330 goto stray_backslash;
1332 return dfa->lex.lasttok = NOTLIMWORD;
1334 case '?':
1335 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1336 goto default_case;
1337 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1338 goto normal_char;
1339 if (dfa->lex.laststart)
1341 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1342 goto default_case;
1343 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1344 dfawarn (_("? at start of expression"));
1346 return dfa->lex.lasttok = QMARK;
1348 case '*':
1349 if (backslash)
1350 goto normal_char;
1351 if (dfa->lex.laststart)
1353 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1354 goto default_case;
1355 if (dfa->syntax.dfaopts & DFA_STAR_WARN)
1356 dfawarn (_("* at start of expression"));
1358 return dfa->lex.lasttok = STAR;
1360 case '+':
1361 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1362 goto default_case;
1363 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1364 goto normal_char;
1365 if (dfa->lex.laststart)
1367 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1368 goto default_case;
1369 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1370 dfawarn (_("+ at start of expression"));
1372 return dfa->lex.lasttok = PLUS;
1374 case '{':
1375 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1376 goto default_case;
1377 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1378 goto normal_char;
1380 /* Cases:
1381 {M} - exact count
1382 {M,} - minimum count, maximum is infinity
1383 {,N} - 0 through N
1384 {,} - 0 to infinity (same as '*')
1385 {M,N} - M through N */
1387 char const *p = dfa->lex.ptr;
1388 char const *lim = p + dfa->lex.left;
1389 dfa->lex.minrep = dfa->lex.maxrep = -1;
1390 for (; p != lim && c_isdigit (*p); p++)
1391 dfa->lex.minrep = (dfa->lex.minrep < 0
1392 ? *p - '0'
1393 : MIN (RE_DUP_MAX + 1,
1394 dfa->lex.minrep * 10 + *p - '0'));
1395 if (p != lim)
1397 if (*p != ',')
1398 dfa->lex.maxrep = dfa->lex.minrep;
1399 else
1401 if (dfa->lex.minrep < 0)
1402 dfa->lex.minrep = 0;
1403 while (++p != lim && c_isdigit (*p))
1404 dfa->lex.maxrep
1405 = (dfa->lex.maxrep < 0
1406 ? *p - '0'
1407 : MIN (RE_DUP_MAX + 1,
1408 dfa->lex.maxrep * 10 + *p - '0'));
1411 bool invalid_content
1412 = ! ((! backslash || (p != lim && *p++ == '\\'))
1413 && p != lim && *p++ == '}'
1414 && 0 <= dfa->lex.minrep
1415 && (dfa->lex.maxrep < 0
1416 || dfa->lex.minrep <= dfa->lex.maxrep));
1417 if (invalid_content
1418 && (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD))
1419 goto normal_char;
1420 if (dfa->lex.laststart)
1422 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1423 goto default_case;
1424 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1425 dfawarn (_("{...} at start of expression"));
1427 if (invalid_content)
1428 dfaerror (_("invalid content of \\{\\}"));
1429 if (RE_DUP_MAX < dfa->lex.maxrep)
1430 dfaerror (_("regular expression too big"));
1431 dfa->lex.ptr = p;
1432 dfa->lex.left = lim - p;
1434 dfa->lex.laststart = false;
1435 return dfa->lex.lasttok = REPMN;
1437 case '|':
1438 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1439 goto default_case;
1440 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1441 goto normal_char;
1442 dfa->lex.laststart = true;
1443 return dfa->lex.lasttok = OR;
1445 case '\n':
1446 if (!(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1447 goto default_case;
1448 if (backslash)
1449 goto normal_char;
1450 dfa->lex.laststart = true;
1451 return dfa->lex.lasttok = OR;
1453 case '(':
1454 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1455 goto normal_char;
1456 dfa->lex.parens++;
1457 dfa->lex.laststart = true;
1458 return dfa->lex.lasttok = LPAREN;
1460 case ')':
1461 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1462 goto normal_char;
1463 if (dfa->lex.parens == 0
1464 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1465 goto normal_char;
1466 dfa->lex.parens--;
1467 dfa->lex.laststart = false;
1468 return dfa->lex.lasttok = RPAREN;
1470 case '.':
1471 if (backslash)
1472 goto normal_char;
1473 if (dfa->canychar < 0)
1475 charclass ccl;
1476 fillset (&ccl);
1477 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1478 clrbit ('\n', &ccl);
1479 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1480 clrbit ('\0', &ccl);
1481 if (dfa->localeinfo.multibyte)
1482 for (int c2 = 0; c2 < NOTCHAR; c2++)
1483 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1484 clrbit (c2, &ccl);
1485 dfa->canychar = charclass_index (dfa, &ccl);
1487 dfa->lex.laststart = false;
1488 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1489 ? ANYCHAR
1490 : CSET + dfa->canychar);
1492 case 's':
1493 case 'S':
1494 if (!backslash)
1495 goto normal_char;
1496 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1497 goto stray_backslash;
1499 if (!dfa->localeinfo.multibyte)
1501 charclass ccl;
1502 zeroset (&ccl);
1503 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1504 if (isspace (c2))
1505 setbit (c2, &ccl);
1506 if (c == 'S')
1507 notset (&ccl);
1508 dfa->lex.laststart = false;
1509 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1512 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1513 add_utf8_anychar, makes sense. */
1515 /* \s and \S are documented to be equivalent to [[:space:]] and
1516 [^[:space:]] respectively, so tell the lexer to process those
1517 strings, each minus its "already processed" '['. */
1519 struct lexptr ls;
1520 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1521 dfa->lex.lasttok = parse_bracket_exp (dfa);
1522 pop_lex_state (dfa, &ls);
1525 dfa->lex.laststart = false;
1526 return dfa->lex.lasttok;
1528 case 'w':
1529 case 'W':
1530 if (!backslash)
1531 goto normal_char;
1532 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1533 goto stray_backslash;
1535 if (!dfa->localeinfo.multibyte)
1537 charclass ccl;
1538 zeroset (&ccl);
1539 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1540 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1541 setbit (c2, &ccl);
1542 if (c == 'W')
1543 notset (&ccl);
1544 dfa->lex.laststart = false;
1545 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1548 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1549 add_utf8_anychar, makes sense. */
1551 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1552 [^_[:alnum:]] respectively, so tell the lexer to process those
1553 strings, each minus its "already processed" '['. */
1555 struct lexptr ls;
1556 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1557 dfa->lex.lasttok = parse_bracket_exp (dfa);
1558 pop_lex_state (dfa, &ls);
1561 dfa->lex.laststart = false;
1562 return dfa->lex.lasttok;
1564 case '[':
1565 if (backslash)
1566 goto normal_char;
1567 dfa->lex.laststart = false;
1568 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1570 default:
1571 default_case:
1572 if (!backslash)
1573 goto normal_char;
1574 stray_backslash:
1575 if (dfa->syntax.dfaopts & DFA_STRAY_BACKSLASH_WARN)
1577 char const *msg;
1578 char msgbuf[100];
1579 if (!c32isprint (dfa->lex.wctok))
1580 msg = _("stray \\ before unprintable character");
1581 else if (c32isspace (dfa->lex.wctok))
1582 msg = _("stray \\ before white space");
1583 else
1585 char buf[MB_LEN_MAX + 1];
1586 mbstate_t s;
1587 mbszero (&s);
1588 size_t stored_bytes = c32rtomb (buf, dfa->lex.wctok, &s);
1589 if (stored_bytes < (size_t) -1)
1591 buf[stored_bytes] = '\0';
1592 int n = snprintf (msgbuf, sizeof msgbuf,
1593 _("stray \\ before %s"), buf);
1594 msg = 0 <= n && n < sizeof msgbuf ? msgbuf : _("stray \\");
1596 else
1597 msg = _("stray \\");
1599 dfawarn (msg);
1601 FALLTHROUGH;
1602 case ']': case '}':
1603 normal_char:
1604 dfa->lex.laststart = false;
1605 /* For multibyte character sets, folding is done in atom. Always
1606 return WCHAR. */
1607 if (dfa->localeinfo.multibyte)
1608 return dfa->lex.lasttok = WCHAR;
1610 if (dfa->syntax.case_fold && isalpha (c))
1612 charclass ccl;
1613 zeroset (&ccl);
1614 setbit_case_fold_c (c, &ccl);
1615 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1618 return dfa->lex.lasttok = c;
1623 static void
1624 addtok_mb (struct dfa *dfa, token t, char mbprop)
1626 if (dfa->talloc == dfa->tindex)
1628 dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1629 sizeof *dfa->tokens);
1630 if (dfa->localeinfo.multibyte)
1631 dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1632 sizeof *dfa->multibyte_prop);
1634 if (dfa->localeinfo.multibyte)
1635 dfa->multibyte_prop[dfa->tindex] = mbprop;
1636 dfa->tokens[dfa->tindex++] = t;
1638 switch (t)
1640 case QMARK:
1641 case STAR:
1642 case PLUS:
1643 break;
1645 case CAT:
1646 case OR:
1647 dfa->parse.depth--;
1648 break;
1650 case EMPTY:
1651 dfa->epsilon = true;
1652 goto increment_depth;
1654 case BACKREF:
1655 dfa->fast = false;
1656 goto increment_nleaves;
1658 case BEGLINE:
1659 case ENDLINE:
1660 case BEGWORD:
1661 case ENDWORD:
1662 case LIMWORD:
1663 case NOTLIMWORD:
1664 dfa->epsilon = true;
1665 FALLTHROUGH;
1666 default:
1667 increment_nleaves:
1668 dfa->nleaves++;
1669 increment_depth:
1670 dfa->parse.depth++;
1671 if (dfa->depth < dfa->parse.depth)
1672 dfa->depth = dfa->parse.depth;
1673 break;
1677 static void addtok_wc (struct dfa *dfa, wint_t wc);
1679 /* Add the given token to the parse tree, maintaining the depth count and
1680 updating the maximum depth if necessary. */
1681 static void
1682 addtok (struct dfa *dfa, token t)
1684 if (dfa->localeinfo.multibyte && t == MBCSET)
1686 bool need_or = false;
1688 /* Extract wide characters into alternations for better performance.
1689 This does not require UTF-8. */
1690 for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1692 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1693 if (need_or)
1694 addtok (dfa, OR);
1695 need_or = true;
1697 dfa->lex.brack.nchars = 0;
1699 /* Wide characters have been handled above, so it is possible
1700 that the set is empty now. Do nothing in that case. */
1701 if (dfa->lex.brack.cset != -1)
1703 addtok (dfa, CSET + dfa->lex.brack.cset);
1704 if (need_or)
1705 addtok (dfa, OR);
1708 else
1710 addtok_mb (dfa, t, 3);
1714 /* We treat a multibyte character as a single atom, so that DFA
1715 can treat a multibyte character as a single expression.
1717 e.g., we construct the following tree from "<mb1><mb2>".
1718 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1719 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1720 static void
1721 addtok_wc (struct dfa *dfa, wint_t wc)
1723 unsigned char buf[MB_LEN_MAX];
1724 mbstate_t s;
1725 mbszero (&s);
1726 size_t stored_bytes = c32rtomb ((char *) buf, wc, &s);
1727 int buflen;
1729 if (stored_bytes != (size_t) -1)
1730 buflen = stored_bytes;
1731 else
1733 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1734 the addtok_mb call altogether can corrupt the heap. */
1735 buflen = 1;
1736 buf[0] = 0;
1739 addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1740 for (int i = 1; i < buflen; i++)
1742 addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1743 addtok (dfa, CAT);
1747 static void
1748 add_utf8_anychar (struct dfa *dfa)
1750 /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1751 UTF-8 byte sequence has been defined as follows:
1753 ([\x00-\x7f]
1754 |[\xc2-\xdf][\x80-\xbf]
1755 |[\xe0][\xa0-\xbf][\x80-\xbf]
1756 |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1757 |[\xed][\x80-\x9f][\x80-\xbf]
1758 |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1759 |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1760 |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1762 which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1763 where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1764 D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1765 H = [\x80-\x9f], I = [\xf0],
1766 J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1768 This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */
1770 /* Mnemonics for classes containing two or more bytes. */
1771 enum { A, B, C, E, F, H, J, K, M };
1773 /* Mnemonics for single-byte tokens. */
1774 enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1776 static charclass const utf8_classes[] = {
1777 /* A. 00-7f: 1-byte sequence. */
1778 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1780 /* B. c2-df: 1st byte of a 2-byte sequence. */
1781 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1783 /* C. 80-bf: non-leading bytes. */
1784 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1786 /* D. e0 (just a token). */
1788 /* E. a0-bf: 2nd byte of a "DEC" sequence. */
1789 CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1791 /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1792 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1794 /* G. ed (just a token). */
1796 /* H. 80-9f: 2nd byte of a "GHC" sequence. */
1797 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0, 0, 0),
1799 /* I. f0 (just a token). */
1801 /* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1802 CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1804 /* K. f1-f3: 1st byte of a "KCCC" sequence. */
1805 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1807 /* L. f4 (just a token). */
1809 /* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1810 CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1813 /* Define the character classes that are needed below. */
1814 if (dfa->utf8_anychar_classes[0] == 0)
1816 charclass c = utf8_classes[0];
1817 if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1818 clrbit ('\n', &c);
1819 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1820 clrbit ('\0', &c);
1821 dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1823 for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1824 dfa->utf8_anychar_classes[i]
1825 = CSET + charclass_index (dfa, &utf8_classes[i]);
1828 /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1829 The token buffer is in reverse Polish order, so we get
1830 "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1831 C CAT OR C CAT OR C CAT OR". */
1832 addtok (dfa, dfa->utf8_anychar_classes[A]);
1833 addtok (dfa, dfa->utf8_anychar_classes[B]);
1834 addtok (dfa, D_token);
1835 addtok (dfa, dfa->utf8_anychar_classes[E]);
1836 addtok (dfa, CAT);
1837 addtok (dfa, OR);
1838 addtok (dfa, G_token);
1839 addtok (dfa, dfa->utf8_anychar_classes[H]);
1840 addtok (dfa, CAT);
1841 addtok (dfa, OR);
1842 addtok (dfa, dfa->utf8_anychar_classes[F]);
1843 addtok (dfa, I_token);
1844 addtok (dfa, dfa->utf8_anychar_classes[J]);
1845 addtok (dfa, CAT);
1846 addtok (dfa, OR);
1847 addtok (dfa, L_token);
1848 addtok (dfa, dfa->utf8_anychar_classes[M]);
1849 addtok (dfa, CAT);
1850 addtok (dfa, OR);
1851 addtok (dfa, dfa->utf8_anychar_classes[K]);
1852 for (int i = 0; i < 3; i++)
1854 addtok (dfa, dfa->utf8_anychar_classes[C]);
1855 addtok (dfa, CAT);
1856 addtok (dfa, OR);
1860 /* The grammar understood by the parser is as follows.
1862 regexp:
1863 regexp OR branch
1864 branch
1866 branch:
1867 branch closure
1868 closure
1870 closure:
1871 closure QMARK
1872 closure STAR
1873 closure PLUS
1874 closure REPMN
1875 atom
1877 atom:
1878 <normal character>
1879 <multibyte character>
1880 ANYCHAR
1881 MBCSET
1882 CSET
1883 BACKREF
1884 BEGLINE
1885 ENDLINE
1886 BEGWORD
1887 ENDWORD
1888 LIMWORD
1889 NOTLIMWORD
1890 LPAREN regexp RPAREN
1891 <empty>
1893 The parser builds a parse tree in postfix form in an array of tokens. */
1895 static void
1896 atom (struct dfa *dfa)
1898 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1899 || dfa->parse.tok >= CSET
1900 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1901 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1902 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1903 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1904 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1906 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1908 /* For UTF-8 expand the period to a series of CSETs that define a
1909 valid UTF-8 character. This avoids using the slow multibyte
1910 path. I'm pretty sure it would be both profitable and correct to
1911 do it for any encoding; however, the optimization must be done
1912 manually as it is done above in add_utf8_anychar. So, let's
1913 start with UTF-8: it is the most used, and the structure of the
1914 encoding makes the correctness more obvious. */
1915 add_utf8_anychar (dfa);
1917 else
1918 addtok (dfa, dfa->parse.tok);
1919 dfa->parse.tok = lex (dfa);
1921 else if (dfa->parse.tok == WCHAR)
1923 if (dfa->lex.wctok == WEOF)
1924 addtok (dfa, BACKREF);
1925 else
1927 addtok_wc (dfa, dfa->lex.wctok);
1929 if (dfa->syntax.case_fold)
1931 char32_t folded[CASE_FOLDED_BUFSIZE];
1932 int n = case_folded_counterparts (dfa->lex.wctok, folded);
1933 for (int i = 0; i < n; i++)
1935 addtok_wc (dfa, folded[i]);
1936 addtok (dfa, OR);
1941 dfa->parse.tok = lex (dfa);
1943 else if (dfa->parse.tok == LPAREN)
1945 dfa->parse.tok = lex (dfa);
1946 regexp (dfa);
1947 if (dfa->parse.tok != RPAREN)
1948 dfaerror (_("unbalanced ("));
1949 dfa->parse.tok = lex (dfa);
1951 else
1952 addtok (dfa, EMPTY);
1955 /* Return the number of tokens in the given subexpression. */
1956 static idx_t _GL_ATTRIBUTE_PURE
1957 nsubtoks (struct dfa const *dfa, idx_t tindex)
1959 switch (dfa->tokens[tindex - 1])
1961 default:
1962 return 1;
1963 case QMARK:
1964 case STAR:
1965 case PLUS:
1966 return 1 + nsubtoks (dfa, tindex - 1);
1967 case CAT:
1968 case OR:
1970 idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1971 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1976 /* Copy the given subexpression to the top of the tree. */
1977 static void
1978 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1980 if (dfa->localeinfo.multibyte)
1981 for (idx_t i = 0; i < ntokens; i++)
1982 addtok_mb (dfa, dfa->tokens[tindex + i],
1983 dfa->multibyte_prop[tindex + i]);
1984 else
1985 for (idx_t i = 0; i < ntokens; i++)
1986 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1989 static void
1990 closure (struct dfa *dfa)
1992 atom (dfa);
1993 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1994 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1995 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1997 idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1998 idx_t tindex = dfa->tindex - ntokens;
1999 if (dfa->lex.maxrep < 0)
2000 addtok (dfa, PLUS);
2001 if (dfa->lex.minrep == 0)
2002 addtok (dfa, QMARK);
2003 int i;
2004 for (i = 1; i < dfa->lex.minrep; i++)
2006 copytoks (dfa, tindex, ntokens);
2007 addtok (dfa, CAT);
2009 for (; i < dfa->lex.maxrep; i++)
2011 copytoks (dfa, tindex, ntokens);
2012 addtok (dfa, QMARK);
2013 addtok (dfa, CAT);
2015 dfa->parse.tok = lex (dfa);
2017 else if (dfa->parse.tok == REPMN)
2019 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
2020 dfa->parse.tok = lex (dfa);
2021 closure (dfa);
2023 else
2025 addtok (dfa, dfa->parse.tok);
2026 dfa->parse.tok = lex (dfa);
2030 static void
2031 branch (struct dfa* dfa)
2033 closure (dfa);
2034 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2035 && dfa->parse.tok >= 0)
2037 closure (dfa);
2038 addtok (dfa, CAT);
2042 static void
2043 regexp (struct dfa *dfa)
2045 branch (dfa);
2046 while (dfa->parse.tok == OR)
2048 dfa->parse.tok = lex (dfa);
2049 branch (dfa);
2050 addtok (dfa, OR);
2054 /* Parse a string S of length LEN into D. S can include NUL characters.
2055 This is the main entry point for the parser. */
2056 void
2057 dfaparse (char const *s, idx_t len, struct dfa *d)
2059 d->lex.ptr = s;
2060 d->lex.left = len;
2061 d->lex.lasttok = END;
2062 d->lex.laststart = true;
2064 if (!d->syntax.syntax_bits_set)
2065 dfaerror (_("no syntax specified"));
2067 if (!d->nregexps)
2068 addtok (d, BEG);
2070 d->parse.tok = lex (d);
2071 d->parse.depth = d->depth;
2073 regexp (d);
2075 if (d->parse.tok != END)
2076 dfaerror (_("unbalanced )"));
2078 addtok (d, END - d->nregexps);
2079 addtok (d, CAT);
2081 if (d->nregexps)
2082 addtok (d, OR);
2084 ++d->nregexps;
2087 /* Some primitives for operating on sets of positions. */
2089 /* Copy one set to another. */
2090 static void
2091 copy (position_set const *src, position_set *dst)
2093 if (dst->alloc < src->nelem)
2095 free (dst->elems);
2096 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2097 sizeof *dst->elems);
2099 dst->nelem = src->nelem;
2100 if (src->nelem != 0)
2101 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2104 static void
2105 alloc_position_set (position_set *s, idx_t size)
2107 s->elems = xnmalloc (size, sizeof *s->elems);
2108 s->alloc = size;
2109 s->nelem = 0;
2112 /* Insert position P in set S. S is maintained in sorted order on
2113 decreasing index. If there is already an entry in S with P.index
2114 then merge (logically-OR) P's constraints into the one in S.
2115 S->elems must point to an array large enough to hold the resulting set. */
2116 static void
2117 insert (position p, position_set *s)
2119 idx_t count = s->nelem;
2120 idx_t lo = 0, hi = count;
2121 while (lo < hi)
2123 idx_t mid = (lo + hi) >> 1;
2124 if (s->elems[mid].index < p.index)
2125 lo = mid + 1;
2126 else if (s->elems[mid].index == p.index)
2128 s->elems[mid].constraint |= p.constraint;
2129 return;
2131 else
2132 hi = mid;
2135 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2136 for (idx_t i = count; i > lo; i--)
2137 s->elems[i] = s->elems[i - 1];
2138 s->elems[lo] = p;
2139 ++s->nelem;
2142 static void
2143 append (position p, position_set *s)
2145 idx_t count = s->nelem;
2146 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2147 s->elems[s->nelem++] = p;
2150 /* Merge S1 and S2 (with the additional constraint C2) into M. The
2151 result is as if the positions of S1, and of S2 with the additional
2152 constraint C2, were inserted into an initially empty set. */
2153 static void
2154 merge_constrained (position_set const *s1, position_set const *s2,
2155 unsigned int c2, position_set *m)
2157 idx_t i = 0, j = 0;
2159 if (m->alloc - s1->nelem < s2->nelem)
2161 free (m->elems);
2162 m->alloc = s1->nelem;
2163 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2165 m->nelem = 0;
2166 while (i < s1->nelem || j < s2->nelem)
2167 if (! (j < s2->nelem)
2168 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2170 unsigned int c = ((i < s1->nelem && j < s2->nelem
2171 && s1->elems[i].index == s2->elems[j].index)
2172 ? s2->elems[j++].constraint & c2
2173 : 0);
2174 m->elems[m->nelem].index = s1->elems[i].index;
2175 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2177 else
2179 if (s2->elems[j].constraint & c2)
2181 m->elems[m->nelem].index = s2->elems[j].index;
2182 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2184 j++;
2188 /* Merge two sets of positions into a third. The result is exactly as if
2189 the positions of both sets were inserted into an initially empty set. */
2190 static void
2191 merge (position_set const *s1, position_set const *s2, position_set *m)
2193 merge_constrained (s1, s2, -1, m);
2196 /* Merge into DST all the elements of SRC, possibly destroying
2197 the contents of the temporary M. */
2198 static void
2199 merge2 (position_set *dst, position_set const *src, position_set *m)
2201 if (src->nelem < 4)
2203 for (idx_t i = 0; i < src->nelem; i++)
2204 insert (src->elems[i], dst);
2206 else
2208 merge (src, dst, m);
2209 copy (m, dst);
2213 /* Delete a position from a set. Return the nonzero constraint of the
2214 deleted position, or zero if there was no such position. */
2215 static unsigned int
2216 delete (idx_t del, position_set *s)
2218 idx_t count = s->nelem;
2219 idx_t lo = 0, hi = count;
2220 while (lo < hi)
2222 idx_t mid = (lo + hi) >> 1;
2223 if (s->elems[mid].index < del)
2224 lo = mid + 1;
2225 else if (s->elems[mid].index == del)
2227 unsigned int c = s->elems[mid].constraint;
2228 idx_t i;
2229 for (i = mid; i + 1 < count; i++)
2230 s->elems[i] = s->elems[i + 1];
2231 s->nelem = i;
2232 return c;
2234 else
2235 hi = mid;
2237 return 0;
2240 /* Replace a position with the followed set. */
2241 static void
2242 replace (position_set *dst, idx_t del, position_set *add,
2243 unsigned int constraint, position_set *tmp)
2245 unsigned int c = delete (del, dst) & constraint;
2247 if (c)
2249 copy (dst, tmp);
2250 merge_constrained (tmp, add, c, dst);
2254 /* Find the index of the state corresponding to the given position set with
2255 the given preceding context, or create a new state if there is no such
2256 state. Context tells whether we got here on a newline or letter. */
2257 static state_num
2258 state_index (struct dfa *d, position_set const *s, int context)
2260 size_t hash = 0;
2261 int constraint = 0;
2262 state_num i;
2264 for (i = 0; i < s->nelem; ++i)
2266 idx_t ind = s->elems[i].index;
2267 hash ^= ind + s->elems[i].constraint;
2270 /* Try to find a state that exactly matches the proposed one. */
2271 for (i = 0; i < d->sindex; ++i)
2273 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2274 || context != d->states[i].context)
2275 continue;
2276 state_num j;
2277 for (j = 0; j < s->nelem; ++j)
2278 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2279 || s->elems[j].index != d->states[i].elems.elems[j].index)
2280 break;
2281 if (j == s->nelem)
2282 return i;
2285 #ifdef DEBUG
2286 fprintf (stderr, "new state %td\n nextpos:", i);
2287 for (state_num j = 0; j < s->nelem; j++)
2289 fprintf (stderr, " %td:", s->elems[j].index);
2290 prtok (d->tokens[s->elems[j].index]);
2292 fprintf (stderr, "\n context:");
2293 if (context ^ CTX_ANY)
2295 if (context & CTX_NONE)
2296 fprintf (stderr, " CTX_NONE");
2297 if (context & CTX_LETTER)
2298 fprintf (stderr, " CTX_LETTER");
2299 if (context & CTX_NEWLINE)
2300 fprintf (stderr, " CTX_NEWLINE");
2302 else
2303 fprintf (stderr, " CTX_ANY");
2304 fprintf (stderr, "\n");
2305 #endif
2307 for (state_num j = 0; j < s->nelem; j++)
2309 int c = d->constraints[s->elems[j].index];
2311 if (c != 0)
2313 if (succeeds_in_context (c, context, CTX_ANY))
2314 constraint |= c;
2316 else if (d->tokens[s->elems[j].index] == BACKREF)
2317 constraint = NO_CONSTRAINT;
2321 /* Create a new state. */
2322 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2323 sizeof *d->states);
2324 d->states[i].hash = hash;
2325 alloc_position_set (&d->states[i].elems, s->nelem);
2326 copy (s, &d->states[i].elems);
2327 d->states[i].context = context;
2328 d->states[i].constraint = constraint;
2329 d->states[i].mbps.nelem = 0;
2330 d->states[i].mbps.elems = NULL;
2331 d->states[i].mb_trindex = -1;
2333 ++d->sindex;
2335 return i;
2338 /* Find the epsilon closure of D's set of positions. If any position of the set
2339 contains a symbol that matches the empty string in some context, replace
2340 that position with the elements of its follow labeled with an appropriate
2341 constraint. Repeat exhaustively until no funny positions are left.
2342 S->elems must be large enough to hold the result. BACKWARD is D's
2343 backward set; use and update it too. */
2344 static void
2345 epsclosure (struct dfa const *d, position_set *backward)
2347 position_set tmp;
2348 alloc_position_set (&tmp, d->nleaves);
2349 for (idx_t i = 0; i < d->tindex; i++)
2350 if (0 < d->follows[i].nelem)
2352 unsigned int constraint;
2353 switch (d->tokens[i])
2355 default:
2356 continue;
2358 case BEGLINE:
2359 constraint = BEGLINE_CONSTRAINT;
2360 break;
2361 case ENDLINE:
2362 constraint = ENDLINE_CONSTRAINT;
2363 break;
2364 case BEGWORD:
2365 constraint = BEGWORD_CONSTRAINT;
2366 break;
2367 case ENDWORD:
2368 constraint = ENDWORD_CONSTRAINT;
2369 break;
2370 case LIMWORD:
2371 constraint = LIMWORD_CONSTRAINT;
2372 break;
2373 case NOTLIMWORD:
2374 constraint = NOTLIMWORD_CONSTRAINT;
2375 break;
2376 case EMPTY:
2377 constraint = NO_CONSTRAINT;
2378 break;
2381 delete (i, &d->follows[i]);
2383 for (idx_t j = 0; j < backward[i].nelem; j++)
2384 replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2385 constraint, &tmp);
2386 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2387 replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2388 NO_CONSTRAINT, &tmp);
2390 free (tmp.elems);
2393 /* Returns the set of contexts for which there is at least one
2394 character included in C. */
2396 static int
2397 charclass_context (struct dfa const *dfa, charclass const *c)
2399 int context = 0;
2401 for (int j = 0; j < CHARCLASS_WORDS; j++)
2403 if (c->w[j] & dfa->syntax.newline.w[j])
2404 context |= CTX_NEWLINE;
2405 if (c->w[j] & dfa->syntax.letters.w[j])
2406 context |= CTX_LETTER;
2407 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2408 context |= CTX_NONE;
2411 return context;
2414 /* Returns the contexts on which the position set S depends. Each context
2415 in the set of returned contexts (let's call it SC) may have a different
2416 follow set than other contexts in SC, and also different from the
2417 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2418 in the complement set will have the same follow set. */
2420 static int _GL_ATTRIBUTE_PURE
2421 state_separate_contexts (struct dfa *d, position_set const *s)
2423 int separate_contexts = 0;
2425 for (idx_t j = 0; j < s->nelem; j++)
2426 separate_contexts |= d->separates[s->elems[j].index];
2428 return separate_contexts;
2431 enum
2433 /* Single token is repeated. It is distinguished from non-repeated. */
2434 OPT_REPEAT = (1 << 0),
2436 /* Multiple tokens are repeated. This flag is on at head of tokens. The
2437 node is not merged. */
2438 OPT_LPAREN = (1 << 1),
2440 /* Multiple branches are joined. The node is not merged. */
2441 OPT_RPAREN = (1 << 2),
2443 /* The node is walked. If the node is found in walking again, OPT_RPAREN
2444 flag is turned on. */
2445 OPT_WALKED = (1 << 3),
2447 /* The node is queued. The node is not queued again. */
2448 OPT_QUEUED = (1 << 4)
2451 static void
2452 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2453 position_set *merged)
2455 position_set *follows = d->follows;
2456 idx_t nelem = 0;
2458 for (idx_t i = 0; i < follows[tindex].nelem; i++)
2460 idx_t sindex = follows[tindex].elems[i].index;
2462 /* Skip the node as pruned in future. */
2463 unsigned int iconstraint = follows[tindex].elems[i].constraint;
2464 if (iconstraint == 0)
2465 continue;
2467 if (d->tokens[follows[tindex].elems[i].index] <= END)
2469 d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2470 continue;
2473 if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2475 idx_t j;
2477 for (j = 0; j < nelem; j++)
2479 idx_t dindex = follows[tindex].elems[j].index;
2481 if (dindex == tindex)
2482 continue;
2484 if (follows[tindex].elems[j].constraint != iconstraint)
2485 continue;
2487 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2488 continue;
2490 if (d->tokens[sindex] != d->tokens[dindex])
2491 continue;
2493 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2494 continue;
2496 if (flags[sindex] & OPT_REPEAT)
2497 delete (sindex, &follows[sindex]);
2499 merge2 (&follows[dindex], &follows[sindex], merged);
2501 break;
2504 if (j < nelem)
2505 continue;
2508 follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2509 flags[sindex] |= OPT_QUEUED;
2512 follows[tindex].nelem = nelem;
2515 static int
2516 compare (const void *a, const void *b)
2518 position const *p = a, *q = b;
2519 return (p->index > q->index) - (p->index < q->index);
2522 static void
2523 reorder_tokens (struct dfa *d)
2525 idx_t nleaves = 0;
2526 ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2527 map[0] = nleaves++;
2528 for (idx_t i = 1; i < d->tindex; i++)
2529 map[i] = -1;
2531 token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2532 position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2533 int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2534 char *multibyte_prop = (d->localeinfo.multibyte
2535 ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2536 : NULL);
2538 for (idx_t i = 0; i < d->tindex; i++)
2540 if (map[i] < 0)
2542 free (d->follows[i].elems);
2543 d->follows[i].elems = NULL;
2544 d->follows[i].nelem = 0;
2545 continue;
2548 tokens[map[i]] = d->tokens[i];
2549 follows[map[i]] = d->follows[i];
2550 constraints[map[i]] = d->constraints[i];
2552 if (multibyte_prop != NULL)
2553 multibyte_prop[map[i]] = d->multibyte_prop[i];
2555 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2557 if (map[d->follows[i].elems[j].index] == -1)
2558 map[d->follows[i].elems[j].index] = nleaves++;
2560 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2563 qsort (d->follows[i].elems, d->follows[i].nelem,
2564 sizeof *d->follows[i].elems, compare);
2567 for (idx_t i = 0; i < nleaves; i++)
2569 d->tokens[i] = tokens[i];
2570 d->follows[i] = follows[i];
2571 d->constraints[i] = constraints[i];
2573 if (multibyte_prop != NULL)
2574 d->multibyte_prop[i] = multibyte_prop[i];
2577 d->tindex = d->nleaves = nleaves;
2579 free (tokens);
2580 free (follows);
2581 free (constraints);
2582 free (multibyte_prop);
2583 free (map);
2586 static void
2587 dfaoptimize (struct dfa *d)
2589 char *flags = xizalloc (d->tindex);
2591 for (idx_t i = 0; i < d->tindex; i++)
2593 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2595 if (d->follows[i].elems[j].index == i)
2596 flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2597 else if (d->follows[i].elems[j].index < i)
2598 flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2599 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2600 flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2601 else
2602 flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2606 flags[0] |= OPT_QUEUED;
2608 position_set merged0;
2609 position_set *merged = &merged0;
2610 alloc_position_set (merged, d->nleaves);
2612 d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2614 for (idx_t i = 0; i < d->tindex; i++)
2615 if (flags[i] & OPT_QUEUED)
2616 merge_nfa_state (d, i, flags, merged);
2618 reorder_tokens (d);
2620 free (merged->elems);
2621 free (flags);
2624 /* Perform bottom-up analysis on the parse tree, computing various functions.
2625 Note that at this point, we're pretending constructs like \< are real
2626 characters rather than constraints on what can follow them.
2628 Nullable: A node is nullable if it is at the root of a regexp that can
2629 match the empty string.
2630 * EMPTY leaves are nullable.
2631 * No other leaf is nullable.
2632 * A QMARK or STAR node is nullable.
2633 * A PLUS node is nullable if its argument is nullable.
2634 * A CAT node is nullable if both its arguments are nullable.
2635 * An OR node is nullable if either argument is nullable.
2637 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2638 that could correspond to the first character of a string matching the
2639 regexp rooted at the given node.
2640 * EMPTY leaves have empty firstpos.
2641 * The firstpos of a nonempty leaf is that leaf itself.
2642 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2643 argument.
2644 * The firstpos of a CAT node is the firstpos of the left argument, union
2645 the firstpos of the right if the left argument is nullable.
2646 * The firstpos of an OR node is the union of firstpos of each argument.
2648 Lastpos: The lastpos of a node is the set of positions that could
2649 correspond to the last character of a string matching the regexp at
2650 the given node.
2651 * EMPTY leaves have empty lastpos.
2652 * The lastpos of a nonempty leaf is that leaf itself.
2653 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2654 argument.
2655 * The lastpos of a CAT node is the lastpos of its right argument, union
2656 the lastpos of the left if the right argument is nullable.
2657 * The lastpos of an OR node is the union of the lastpos of each argument.
2659 Follow: The follow of a position is the set of positions that could
2660 correspond to the character following a character matching the node in
2661 a string matching the regexp. At this point we consider special symbols
2662 that match the empty string in some context to be just normal characters.
2663 Later, if we find that a special symbol is in a follow set, we will
2664 replace it with the elements of its follow, labeled with an appropriate
2665 constraint.
2666 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2667 the follow of every node in the lastpos.
2668 * Every node in the firstpos of the second argument of a CAT node is in
2669 the follow of every node in the lastpos of the first argument.
2671 Because of the postfix representation of the parse tree, the depth-first
2672 analysis is conveniently done by a linear scan with the aid of a stack.
2673 Sets are stored as arrays of the elements, obeying a stack-like allocation
2674 scheme; the number of elements in each set deeper in the stack can be
2675 used to determine the address of a particular set's array. */
2676 static void
2677 dfaanalyze (struct dfa *d, bool searchflag)
2679 /* Array allocated to hold position sets. */
2680 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2681 /* Firstpos and lastpos elements. */
2682 position *firstpos = posalloc;
2683 position *lastpos = firstpos + d->nleaves;
2684 position pos;
2685 position_set tmp;
2687 /* Stack for element counts and nullable flags. */
2688 struct
2690 /* Whether the entry is nullable. */
2691 bool nullable;
2693 /* Counts of firstpos and lastpos sets. */
2694 idx_t nfirstpos;
2695 idx_t nlastpos;
2696 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2698 position_set merged; /* Result of merging sets. */
2700 addtok (d, CAT);
2701 idx_t tindex = d->tindex;
2703 #ifdef DEBUG
2704 fprintf (stderr, "dfaanalyze:\n");
2705 for (idx_t i = 0; i < tindex; i++)
2707 fprintf (stderr, " %td:", i);
2708 prtok (d->tokens[i]);
2710 putc ('\n', stderr);
2711 #endif
2713 d->searchflag = searchflag;
2714 alloc_position_set (&merged, d->nleaves);
2715 d->follows = xicalloc (tindex, sizeof *d->follows);
2716 position_set *backward
2717 = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2719 for (idx_t i = 0; i < tindex; i++)
2721 switch (d->tokens[i])
2723 case EMPTY:
2724 /* The empty set is nullable. */
2725 stk->nullable = true;
2727 /* The firstpos and lastpos of the empty leaf are both empty. */
2728 stk->nfirstpos = stk->nlastpos = 0;
2729 stk++;
2730 break;
2732 case STAR:
2733 case PLUS:
2734 /* Every element in the lastpos of the argument is in the backward
2735 set of every element in the firstpos. */
2736 if (d->epsilon)
2738 tmp.elems = lastpos - stk[-1].nlastpos;
2739 tmp.nelem = stk[-1].nlastpos;
2740 for (position *p = firstpos - stk[-1].nfirstpos;
2741 p < firstpos; p++)
2742 merge2 (&backward[p->index], &tmp, &merged);
2745 /* Every element in the firstpos of the argument is in the follow
2746 of every element in the lastpos. */
2748 tmp.elems = firstpos - stk[-1].nfirstpos;
2749 tmp.nelem = stk[-1].nfirstpos;
2750 for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2751 merge2 (&d->follows[p->index], &tmp, &merged);
2753 FALLTHROUGH;
2754 case QMARK:
2755 /* A QMARK or STAR node is automatically nullable. */
2756 if (d->tokens[i] != PLUS)
2757 stk[-1].nullable = true;
2758 break;
2760 case CAT:
2761 /* Every element in the lastpos of the first argument is in
2762 the backward set of every element in the firstpos of the
2763 second argument. */
2764 if (backward)
2766 tmp.nelem = stk[-2].nlastpos;
2767 tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2768 for (position *p = firstpos - stk[-1].nfirstpos;
2769 p < firstpos; p++)
2770 merge2 (&backward[p->index], &tmp, &merged);
2773 /* Every element in the firstpos of the second argument is in the
2774 follow of every element in the lastpos of the first argument. */
2776 tmp.nelem = stk[-1].nfirstpos;
2777 tmp.elems = firstpos - stk[-1].nfirstpos;
2778 for (position *plim = lastpos - stk[-1].nlastpos,
2779 *p = plim - stk[-2].nlastpos;
2780 p < plim; p++)
2781 merge2 (&d->follows[p->index], &tmp, &merged);
2784 /* The firstpos of a CAT node is the firstpos of the first argument,
2785 union that of the second argument if the first is nullable. */
2786 if (stk[-2].nullable)
2787 stk[-2].nfirstpos += stk[-1].nfirstpos;
2788 else
2789 firstpos -= stk[-1].nfirstpos;
2791 /* The lastpos of a CAT node is the lastpos of the second argument,
2792 union that of the first argument if the second is nullable. */
2793 if (stk[-1].nullable)
2794 stk[-2].nlastpos += stk[-1].nlastpos;
2795 else
2797 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2798 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2799 p[j] = p[j + stk[-2].nlastpos];
2800 lastpos -= stk[-2].nlastpos;
2801 stk[-2].nlastpos = stk[-1].nlastpos;
2804 /* A CAT node is nullable if both arguments are nullable. */
2805 stk[-2].nullable &= stk[-1].nullable;
2806 stk--;
2807 break;
2809 case OR:
2810 /* The firstpos is the union of the firstpos of each argument. */
2811 stk[-2].nfirstpos += stk[-1].nfirstpos;
2813 /* The lastpos is the union of the lastpos of each argument. */
2814 stk[-2].nlastpos += stk[-1].nlastpos;
2816 /* An OR node is nullable if either argument is nullable. */
2817 stk[-2].nullable |= stk[-1].nullable;
2818 stk--;
2819 break;
2821 default:
2822 /* Anything else is a nonempty position. (Note that special
2823 constructs like \< are treated as nonempty strings here;
2824 an "epsilon closure" effectively makes them nullable later.
2825 Backreferences have to get a real position so we can detect
2826 transitions on them later. But they are nullable. */
2827 stk->nullable = d->tokens[i] == BACKREF;
2829 /* This position is in its own firstpos and lastpos. */
2830 stk->nfirstpos = stk->nlastpos = 1;
2831 stk++;
2833 firstpos->index = lastpos->index = i;
2834 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2835 firstpos++, lastpos++;
2837 break;
2839 #ifdef DEBUG
2840 /* ... balance the above nonsyntactic #ifdef goo... */
2841 fprintf (stderr, "node %td:", i);
2842 prtok (d->tokens[i]);
2843 putc ('\n', stderr);
2844 fprintf (stderr,
2845 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2846 fprintf (stderr, " firstpos:");
2847 for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2849 fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2850 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2852 fprintf (stderr, "\n lastpos:");
2853 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2855 fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2856 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2858 putc ('\n', stderr);
2859 #endif
2862 if (backward)
2864 /* For each follow set that is the follow set of a real position,
2865 replace it with its epsilon closure. */
2866 epsclosure (d, backward);
2868 for (idx_t i = 0; i < tindex; i++)
2869 free (backward[i].elems);
2870 free (backward);
2873 dfaoptimize (d);
2875 #ifdef DEBUG
2876 for (idx_t i = 0; i < tindex; i++)
2877 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2878 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2879 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2881 fprintf (stderr, "follows(%td:", i);
2882 prtok (d->tokens[i]);
2883 fprintf (stderr, "):");
2884 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2886 fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2887 prtok (d->tokens[d->follows[i].elems[j].index]);
2889 putc ('\n', stderr);
2891 #endif
2893 pos.index = 0;
2894 pos.constraint = NO_CONSTRAINT;
2896 alloc_position_set (&tmp, 1);
2898 append (pos, &tmp);
2900 d->separates = xicalloc (tindex, sizeof *d->separates);
2902 for (idx_t i = 0; i < tindex; i++)
2904 if (prev_newline_dependent (d->constraints[i]))
2905 d->separates[i] |= CTX_NEWLINE;
2906 if (prev_letter_dependent (d->constraints[i]))
2907 d->separates[i] |= CTX_LETTER;
2909 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2911 if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2912 d->separates[i] |= CTX_NEWLINE;
2913 if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2914 d->separates[i] |= CTX_LETTER;
2918 /* Context wanted by some position. */
2919 int separate_contexts = state_separate_contexts (d, &tmp);
2921 /* Build the initial state. */
2922 if (separate_contexts & CTX_NEWLINE)
2923 state_index (d, &tmp, CTX_NEWLINE);
2924 d->initstate_notbol = d->min_trcount
2925 = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2926 if (separate_contexts & CTX_LETTER)
2927 d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2928 d->min_trcount++;
2929 d->trcount = 0;
2931 free (posalloc);
2932 free (stkalloc);
2933 free (merged.elems);
2934 free (tmp.elems);
2937 /* Make sure D's state arrays are large enough to hold NEW_STATE. */
2938 static void
2939 realloc_trans_if_necessary (struct dfa *d)
2941 state_num oldalloc = d->tralloc;
2942 if (oldalloc < d->sindex)
2944 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2945 idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2946 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2947 -1, sizeof *realtrans);
2948 realtrans[0] = realtrans[1] = NULL;
2949 d->trans = realtrans + 2;
2950 idx_t newalloc = d->tralloc = newalloc1 - 2;
2951 d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2952 d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2953 d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2954 if (d->localeinfo.multibyte)
2956 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2957 realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2958 if (oldalloc == 0)
2959 realtrans[0] = realtrans[1] = NULL;
2960 d->mb_trans = realtrans + 2;
2962 for (; oldalloc < newalloc; oldalloc++)
2964 d->trans[oldalloc] = NULL;
2965 d->fails[oldalloc] = NULL;
2966 if (d->localeinfo.multibyte)
2967 d->mb_trans[oldalloc] = NULL;
2973 Calculate the transition table for a new state derived from state s
2974 for a compiled dfa d after input character uc, and return the new
2975 state number.
2977 Do not worry about all possible input characters; calculate just the group
2978 of positions that match uc. Label it with the set of characters that
2979 every position in the group matches (taking into account, if necessary,
2980 preceding context information of s). Then find the union
2981 of these positions' follows, i.e., the set of positions of the
2982 new state. For each character in the group's label, set the transition
2983 on this character to be to a state corresponding to the set's positions,
2984 and its associated backward context information, if necessary.
2986 When building a searching matcher, include the positions of state
2987 0 in every state.
2989 The group is constructed by building an equivalence-class
2990 partition of the positions of s.
2992 For each position, find the set of characters C that it matches. Eliminate
2993 any characters from C that fail on grounds of backward context.
2995 Check whether the group's label L has nonempty
2996 intersection with C. If L - C is nonempty, create a new group labeled
2997 L - C and having the same positions as the current group, and set L to
2998 the intersection of L and C. Insert the position in the group, set
2999 C = C - L, and resume scanning.
3001 If after comparing with every group there are characters remaining in C,
3002 create a new group labeled with the characters of C and insert this
3003 position in that group. */
3005 static state_num
3006 build_state (state_num s, struct dfa *d, unsigned char uc)
3008 position_set follows; /* Union of the follows for each
3009 position of the current state. */
3010 position_set group; /* Positions that match the input char. */
3011 position_set tmp; /* Temporary space for merging sets. */
3012 state_num state; /* New state. */
3013 state_num state_newline; /* New state on a newline transition. */
3014 state_num state_letter; /* New state on a letter transition. */
3016 #ifdef DEBUG
3017 fprintf (stderr, "build state %td\n", s);
3018 #endif
3020 /* A pointer to the new transition table, and the table itself. */
3021 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
3022 state_num *trans = *ptrans;
3024 if (!trans)
3026 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
3027 transition tables that can exist at once, other than for
3028 initial states. Often-used transition tables are quickly
3029 rebuilt, whereas rarely-used ones are cleared away. */
3030 if (MAX_TRCOUNT <= d->trcount)
3032 for (state_num i = d->min_trcount; i < d->tralloc; i++)
3034 free (d->trans[i]);
3035 free (d->fails[i]);
3036 d->trans[i] = d->fails[i] = NULL;
3038 d->trcount = 0;
3041 d->trcount++;
3042 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
3044 /* Fill transition table with a default value which means that the
3045 transited state has not been calculated yet. */
3046 for (int i = 0; i < NOTCHAR; i++)
3047 trans[i] = -2;
3050 /* Set up the success bits for this state. */
3051 d->success[s] = 0;
3052 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3053 d->success[s] |= CTX_NEWLINE;
3054 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3055 d->success[s] |= CTX_LETTER;
3056 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3057 d->success[s] |= CTX_NONE;
3059 alloc_position_set (&follows, d->nleaves);
3061 /* Find the union of the follows of the positions of the group.
3062 This is a hideously inefficient loop. Fix it someday. */
3063 for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3064 for (idx_t k = 0;
3065 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3066 insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3067 &follows);
3069 /* Positions that match the input char. */
3070 alloc_position_set (&group, d->nleaves);
3072 /* The group's label. */
3073 charclass label;
3074 fillset (&label);
3076 for (idx_t i = 0; i < follows.nelem; i++)
3078 charclass matches; /* Set of matching characters. */
3079 position pos = follows.elems[i];
3080 bool matched = false;
3081 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3083 zeroset (&matches);
3084 setbit (d->tokens[pos.index], &matches);
3085 if (d->tokens[pos.index] == uc)
3086 matched = true;
3088 else if (d->tokens[pos.index] >= CSET)
3090 matches = d->charclasses[d->tokens[pos.index] - CSET];
3091 if (tstbit (uc, &matches))
3092 matched = true;
3094 else if (d->tokens[pos.index] == ANYCHAR)
3096 matches = d->charclasses[d->canychar];
3097 if (tstbit (uc, &matches))
3098 matched = true;
3100 /* ANYCHAR must match with a single character, so we must put
3101 it to D->states[s].mbps which contains the positions which
3102 can match with a single character not a byte. If all
3103 positions which has ANYCHAR does not depend on context of
3104 next character, we put the follows instead of it to
3105 D->states[s].mbps to optimize. */
3106 if (succeeds_in_context (pos.constraint, d->states[s].context,
3107 CTX_NONE))
3109 if (d->states[s].mbps.nelem == 0)
3110 alloc_position_set (&d->states[s].mbps, 1);
3111 insert (pos, &d->states[s].mbps);
3114 else
3115 continue;
3117 /* Some characters may need to be eliminated from matches because
3118 they fail in the current context. */
3119 if (pos.constraint != NO_CONSTRAINT)
3121 if (!succeeds_in_context (pos.constraint,
3122 d->states[s].context, CTX_NEWLINE))
3123 for (int j = 0; j < CHARCLASS_WORDS; j++)
3124 matches.w[j] &= ~d->syntax.newline.w[j];
3125 if (!succeeds_in_context (pos.constraint,
3126 d->states[s].context, CTX_LETTER))
3127 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3128 matches.w[j] &= ~d->syntax.letters.w[j];
3129 if (!succeeds_in_context (pos.constraint,
3130 d->states[s].context, CTX_NONE))
3131 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3132 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3134 /* If there are no characters left, there's no point in going on. */
3135 if (emptyset (&matches))
3136 continue;
3138 /* If we have reset the bit that made us declare "matched", reset
3139 that indicator, too. This is required to avoid an infinite loop
3140 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3141 if (!tstbit (uc, &matches))
3142 matched = false;
3145 #ifdef DEBUG
3146 fprintf (stderr, " nextpos %td:", pos.index);
3147 prtok (d->tokens[pos.index]);
3148 fprintf (stderr, " of");
3149 for (unsigned j = 0; j < NOTCHAR; j++)
3150 if (tstbit (j, &matches))
3151 fprintf (stderr, " 0x%02x", j);
3152 fprintf (stderr, "\n");
3153 #endif
3155 if (matched)
3157 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3158 label.w[k] &= matches.w[k];
3159 append (pos, &group);
3161 else
3163 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3164 label.w[k] &= ~matches.w[k];
3168 alloc_position_set (&tmp, d->nleaves);
3170 if (group.nelem > 0)
3172 /* If we are building a searching matcher, throw in the positions
3173 of state 0 as well, if possible. */
3174 if (d->searchflag)
3176 /* If a token in follows.elems is not 1st byte of a multibyte
3177 character, or the states of follows must accept the bytes
3178 which are not 1st byte of the multibyte character.
3179 Then, if a state of follows encounters a byte, it must not be
3180 a 1st byte of a multibyte character nor a single byte character.
3181 In this case, do not add state[0].follows to next state, because
3182 state[0] must accept 1st-byte.
3184 For example, suppose <sb a> is a certain single byte character,
3185 <mb A> is a certain multibyte character, and the codepoint of
3186 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3187 state[0] accepts <sb a>, state[i] transits to state[i+1] by
3188 accepting the 1st byte of <mb A>, and state[i+1] accepts the
3189 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3190 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3191 not add state[0]. */
3193 bool mergeit = !d->localeinfo.multibyte;
3194 if (!mergeit)
3196 mergeit = true;
3197 for (idx_t j = 0; mergeit && j < group.nelem; j++)
3198 mergeit &= d->multibyte_prop[group.elems[j].index];
3200 if (mergeit)
3201 merge2 (&group, &d->states[0].elems, &tmp);
3204 /* Find out if the new state will want any context information,
3205 by calculating possible contexts that the group can match,
3206 and separate contexts that the new state wants to know. */
3207 int possible_contexts = charclass_context (d, &label);
3208 int separate_contexts = state_separate_contexts (d, &group);
3210 /* Find the state(s) corresponding to the union of the follows. */
3211 if (possible_contexts & ~separate_contexts)
3212 state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3213 else
3214 state = -1;
3215 if (separate_contexts & possible_contexts & CTX_NEWLINE)
3216 state_newline = state_index (d, &group, CTX_NEWLINE);
3217 else
3218 state_newline = state;
3219 if (separate_contexts & possible_contexts & CTX_LETTER)
3220 state_letter = state_index (d, &group, CTX_LETTER);
3221 else
3222 state_letter = state;
3224 /* Reallocate now, to reallocate any newline transition properly. */
3225 realloc_trans_if_necessary (d);
3228 /* If we are a searching matcher, the default transition is to a state
3229 containing the positions of state 0, otherwise the default transition
3230 is to fail miserably. */
3231 else if (d->searchflag)
3233 state_newline = 0;
3234 state_letter = d->min_trcount - 1;
3235 state = d->initstate_notbol;
3237 else
3239 state_newline = -1;
3240 state_letter = -1;
3241 state = -1;
3244 /* Set the transitions for each character in the label. */
3245 for (int i = 0; i < NOTCHAR; i++)
3246 if (tstbit (i, &label))
3247 switch (d->syntax.sbit[i])
3249 case CTX_NEWLINE:
3250 trans[i] = state_newline;
3251 break;
3252 case CTX_LETTER:
3253 trans[i] = state_letter;
3254 break;
3255 default:
3256 trans[i] = state;
3257 break;
3260 #ifdef DEBUG
3261 fprintf (stderr, "trans table %td", s);
3262 for (int i = 0; i < NOTCHAR; ++i)
3264 if (!(i & 0xf))
3265 fprintf (stderr, "\n");
3266 fprintf (stderr, " %2td", trans[i]);
3268 fprintf (stderr, "\n");
3269 #endif
3271 free (group.elems);
3272 free (follows.elems);
3273 free (tmp.elems);
3275 /* Keep the newline transition in a special place so we can use it as
3276 a sentinel. */
3277 if (tstbit (d->syntax.eolbyte, &label))
3279 d->newlines[s] = trans[d->syntax.eolbyte];
3280 trans[d->syntax.eolbyte] = -1;
3283 return trans[uc];
3286 /* Multibyte character handling sub-routines for dfaexec. */
3288 /* Consume a single byte and transit state from 's' to '*next_state'.
3289 This is almost the same as the state transition routine in dfaexec.
3290 But the state transition is done just once; otherwise, matching succeeds or
3291 we reach the end of the buffer. */
3292 static state_num
3293 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3295 state_num *t;
3297 if (d->trans[s])
3298 t = d->trans[s];
3299 else if (d->fails[s])
3300 t = d->fails[s];
3301 else
3303 build_state (s, d, **pp);
3304 if (d->trans[s])
3305 t = d->trans[s];
3306 else
3308 t = d->fails[s];
3309 assert (t);
3313 if (t[**pp] == -2)
3314 build_state (s, d, **pp);
3316 return t[*(*pp)++];
3319 /* Transit state from s, then return new state and update the pointer of
3320 the buffer. This function is for a period operator which can match a
3321 multi-byte character. */
3322 static state_num
3323 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3324 unsigned char const *end)
3326 wint_t wc;
3328 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3330 /* This state has some operators which can match a multibyte character. */
3331 d->mb_follows.nelem = 0;
3333 /* Calculate the state which can be reached from the state 's' by
3334 consuming 'mbclen' single bytes from the buffer. */
3335 state_num s1 = s;
3336 int mbci;
3337 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3338 s = transit_state_singlebyte (d, s, pp);
3339 *pp += mbclen - mbci;
3341 if (wc == WEOF)
3343 /* It is an invalid character, so ANYCHAR is not accepted. */
3344 return s;
3347 /* If all positions which have ANYCHAR do not depend on the context
3348 of the next character, calculate the next state with
3349 pre-calculated follows and cache the result. */
3350 if (d->states[s1].mb_trindex < 0)
3352 if (MAX_TRCOUNT <= d->mb_trcount)
3354 state_num s3;
3355 for (s3 = -1; s3 < d->tralloc; s3++)
3357 free (d->mb_trans[s3]);
3358 d->mb_trans[s3] = NULL;
3361 for (state_num i = 0; i < d->sindex; i++)
3362 d->states[i].mb_trindex = -1;
3363 d->mb_trcount = 0;
3365 d->states[s1].mb_trindex = d->mb_trcount++;
3368 if (! d->mb_trans[s])
3370 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3371 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3372 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3373 for (int i = 0; i < MAX_TRCOUNT; i++)
3374 d->mb_trans[s][i] = -1;
3376 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3377 return d->mb_trans[s][d->states[s1].mb_trindex];
3379 if (s == -1)
3380 copy (&d->states[s1].mbps, &d->mb_follows);
3381 else
3382 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3384 int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3385 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3386 realloc_trans_if_necessary (d);
3388 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3390 return s2;
3393 /* The initial state may encounter a byte which is not a single byte character
3394 nor the first byte of a multibyte character. But it is incorrect for the
3395 initial state to accept such a byte. For example, in Shift JIS the regular
3396 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3397 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3398 that are not a single byte character nor the first byte of a multibyte
3399 character.
3401 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3402 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3403 the result is greater than P, set *WCP to the final wide character
3404 processed, or to WEOF if no wide character is processed. Otherwise,
3405 if WCP is non-NULL, *WCP may or may not be updated.
3407 Both P and MBP must be no larger than END. */
3408 static unsigned char const *
3409 skip_remains_mb (struct dfa *d, unsigned char const *p,
3410 unsigned char const *mbp, char const *end)
3412 if (d->syntax.never_trail[*p])
3413 return p;
3414 while (mbp < p)
3416 wint_t wc;
3417 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3418 end - (char const *) mbp, d);
3420 return mbp;
3423 /* Search through a buffer looking for a match to the struct dfa *D.
3424 Find the first occurrence of a string matching the regexp in the
3425 buffer, and the shortest possible version thereof. Return a pointer to
3426 the first character after the match, or NULL if none is found. BEGIN
3427 points to the beginning of the buffer, and END points to the first byte
3428 after its end. Note however that we store a sentinel byte (usually
3429 newline) in *END, so the actual buffer must be one byte longer.
3430 When ALLOW_NL, newlines may appear in the matching string.
3431 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3432 If MULTIBYTE, the input consists of multibyte characters and/or
3433 encoding-error bytes. Otherwise, it consists of single-byte characters.
3434 Here is the list of features that make this DFA matcher punt:
3435 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3436 - [^...] in non-simple locale
3437 - [[=foo=]] or [[.foo.]]
3438 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3439 - back-reference: (.)\1
3440 - word-delimiter in multibyte locale: \<, \>, \b, \B
3441 See struct localeinfo.simple for the definition of "simple locale". */
3443 static inline char *
3444 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3445 idx_t *count, bool multibyte)
3447 if (MAX_TRCOUNT <= d->sindex)
3449 for (state_num s = d->min_trcount; s < d->sindex; s++)
3451 free (d->states[s].elems.elems);
3452 free (d->states[s].mbps.elems);
3454 d->sindex = d->min_trcount;
3456 if (d->trans)
3458 for (state_num s = 0; s < d->tralloc; s++)
3460 free (d->trans[s]);
3461 free (d->fails[s]);
3462 d->trans[s] = d->fails[s] = NULL;
3464 d->trcount = 0;
3467 if (d->localeinfo.multibyte && d->mb_trans)
3469 for (state_num s = -1; s < d->tralloc; s++)
3471 free (d->mb_trans[s]);
3472 d->mb_trans[s] = NULL;
3474 for (state_num s = 0; s < d->min_trcount; s++)
3475 d->states[s].mb_trindex = -1;
3476 d->mb_trcount = 0;
3480 if (!d->tralloc)
3481 realloc_trans_if_necessary (d);
3483 /* Current state. */
3484 state_num s = 0, s1 = 0;
3486 /* Current input character. */
3487 unsigned char const *p = (unsigned char const *) begin;
3488 unsigned char const *mbp = p;
3490 /* Copy of d->trans so it can be optimized into a register. */
3491 state_num **trans = d->trans;
3492 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3493 unsigned char saved_end = *(unsigned char *) end;
3494 *end = eol;
3496 if (multibyte)
3498 mbszero (&d->mbs);
3499 if (d->mb_follows.alloc == 0)
3500 alloc_position_set (&d->mb_follows, d->nleaves);
3503 idx_t nlcount = 0;
3504 for (;;)
3506 state_num *t;
3507 while ((t = trans[s]) != NULL)
3509 if (s < d->min_trcount)
3511 if (!multibyte || d->states[s].mbps.nelem == 0)
3513 while (t[*p] == s)
3514 p++;
3516 if (multibyte)
3517 p = mbp = skip_remains_mb (d, p, mbp, end);
3520 if (multibyte)
3522 s1 = s;
3524 if (d->states[s].mbps.nelem == 0
3525 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3527 /* If an input character does not match ANYCHAR, do it
3528 like a single-byte character. */
3529 s = t[*p++];
3531 else
3533 s = transit_state (d, s, &p, (unsigned char *) end);
3534 mbp = p;
3535 trans = d->trans;
3538 else
3540 s1 = t[*p++];
3541 t = trans[s1];
3542 if (! t)
3544 state_num tmp = s;
3545 s = s1;
3546 s1 = tmp; /* swap */
3547 break;
3549 if (s < d->min_trcount)
3551 while (t[*p] == s1)
3552 p++;
3554 s = t[*p++];
3558 if (s < 0)
3560 if (s == -2)
3562 s = build_state (s1, d, p[-1]);
3563 trans = d->trans;
3565 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3567 /* The previous character was a newline. Count it, and skip
3568 checking of multibyte character boundary until here. */
3569 nlcount++;
3570 mbp = p;
3572 s = (allow_nl ? d->newlines[s1]
3573 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3574 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3575 : d->initstate_notbol);
3577 else
3579 p = NULL;
3580 goto done;
3583 else if (d->fails[s])
3585 if ((d->success[s] & d->syntax.sbit[*p])
3586 || ((char *) p == end
3587 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3588 d)))
3589 goto done;
3591 if (multibyte && s < d->min_trcount)
3592 p = mbp = skip_remains_mb (d, p, mbp, end);
3594 s1 = s;
3595 if (!multibyte || d->states[s].mbps.nelem == 0
3596 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3598 /* If a input character does not match ANYCHAR, do it
3599 like a single-byte character. */
3600 s = d->fails[s][*p++];
3602 else
3604 s = transit_state (d, s, &p, (unsigned char *) end);
3605 mbp = p;
3606 trans = d->trans;
3609 else
3611 build_state (s, d, p[0]);
3612 trans = d->trans;
3616 done:
3617 if (count)
3618 *count += nlcount;
3619 *end = saved_end;
3620 return (char *) p;
3623 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3624 This is for performance, as dfaexec_main is an inline function. */
3626 static char *
3627 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3628 bool allow_nl, idx_t *count, bool *backref)
3630 return dfaexec_main (d, begin, end, allow_nl, count, true);
3633 static char *
3634 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3635 bool allow_nl, idx_t *count, bool *backref)
3637 return dfaexec_main (d, begin, end, allow_nl, count, false);
3640 /* Always set *BACKREF and return BEGIN. Use this wrapper for
3641 any regexp that uses a construct not supported by this code. */
3642 static char *
3643 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3644 bool allow_nl, idx_t *count, bool *backref)
3646 *backref = true;
3647 return (char *) begin;
3650 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3651 but faster and set *BACKREF if the DFA code does not support this
3652 regexp usage. */
3654 char *
3655 dfaexec (struct dfa *d, char const *begin, char *end,
3656 bool allow_nl, idx_t *count, bool *backref)
3658 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3661 struct dfa *
3662 dfasuperset (struct dfa const *d)
3664 return d->superset;
3667 bool
3668 dfaisfast (struct dfa const *d)
3670 return d->fast;
3673 static void
3674 free_mbdata (struct dfa *d)
3676 free (d->multibyte_prop);
3677 free (d->lex.brack.chars);
3678 free (d->mb_follows.elems);
3680 if (d->mb_trans)
3682 state_num s;
3683 for (s = -1; s < d->tralloc; s++)
3684 free (d->mb_trans[s]);
3685 free (d->mb_trans - 2);
3689 /* Return true if every construct in D is supported by this DFA matcher. */
3690 bool
3691 dfasupported (struct dfa const *d)
3693 for (idx_t i = 0; i < d->tindex; i++)
3695 switch (d->tokens[i])
3697 case BEGWORD:
3698 case ENDWORD:
3699 case LIMWORD:
3700 case NOTLIMWORD:
3701 if (!d->localeinfo.multibyte)
3702 continue;
3703 FALLTHROUGH;
3704 case BACKREF:
3705 case MBCSET:
3706 return false;
3709 return true;
3712 /* Disable use of the superset DFA if it is not likely to help
3713 performance. */
3714 static void
3715 maybe_disable_superset_dfa (struct dfa *d)
3717 if (!d->localeinfo.using_utf8)
3718 return;
3720 bool have_backref = false;
3721 for (idx_t i = 0; i < d->tindex; i++)
3723 switch (d->tokens[i])
3725 case ANYCHAR:
3726 /* Lowered. */
3727 assume (false);
3728 case BACKREF:
3729 have_backref = true;
3730 break;
3731 case MBCSET:
3732 /* Requires multi-byte algorithm. */
3733 return;
3734 default:
3735 break;
3739 if (!have_backref && d->superset)
3741 /* The superset DFA is not likely to be much faster, so remove it. */
3742 dfafree (d->superset);
3743 free (d->superset);
3744 d->superset = NULL;
3747 free_mbdata (d);
3748 d->localeinfo.multibyte = false;
3749 d->dfaexec = dfaexec_sb;
3750 d->fast = true;
3753 static void
3754 dfassbuild (struct dfa *d)
3756 struct dfa *sup = dfaalloc ();
3758 *sup = *d;
3759 sup->localeinfo.multibyte = false;
3760 sup->dfaexec = dfaexec_sb;
3761 sup->multibyte_prop = NULL;
3762 sup->superset = NULL;
3763 sup->states = NULL;
3764 sup->sindex = 0;
3765 sup->constraints = NULL;
3766 sup->separates = NULL;
3767 sup->follows = NULL;
3768 sup->tralloc = 0;
3769 sup->trans = NULL;
3770 sup->fails = NULL;
3771 sup->success = NULL;
3772 sup->newlines = NULL;
3774 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3775 if (d->cindex)
3777 memcpy (sup->charclasses, d->charclasses,
3778 d->cindex * sizeof *sup->charclasses);
3781 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3782 sup->talloc = d->tindex * 2;
3784 bool have_achar = false;
3785 bool have_nchar = false;
3786 idx_t j;
3787 for (idx_t i = j = 0; i < d->tindex; i++)
3789 switch (d->tokens[i])
3791 case ANYCHAR:
3792 case MBCSET:
3793 case BACKREF:
3795 charclass ccl;
3796 fillset (&ccl);
3797 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3798 sup->tokens[j++] = STAR;
3799 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3800 || d->tokens[i + 1] == PLUS)
3801 i++;
3802 have_achar = true;
3804 break;
3805 case BEGWORD:
3806 case ENDWORD:
3807 case LIMWORD:
3808 case NOTLIMWORD:
3809 if (d->localeinfo.multibyte)
3811 /* These constraints aren't supported in a multibyte locale.
3812 Ignore them in the superset DFA. */
3813 sup->tokens[j++] = EMPTY;
3814 break;
3816 FALLTHROUGH;
3817 default:
3818 sup->tokens[j++] = d->tokens[i];
3819 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3820 || d->tokens[i] >= CSET)
3821 have_nchar = true;
3822 break;
3825 sup->tindex = j;
3827 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3828 d->superset = sup;
3829 else
3831 dfafree (sup);
3832 free (sup);
3836 /* Parse a string S of length LEN into D (but skip this step if S is null).
3837 Then analyze D and build a matcher for it.
3838 SEARCHFLAG says whether to build a searching or an exact matcher. */
3839 void
3840 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3842 if (s != NULL)
3843 dfaparse (s, len, d);
3845 dfassbuild (d);
3847 if (dfasupported (d))
3849 maybe_disable_superset_dfa (d);
3850 dfaanalyze (d, searchflag);
3852 else
3854 d->dfaexec = dfaexec_noop;
3857 if (d->superset)
3859 d->fast = true;
3860 dfaanalyze (d->superset, searchflag);
3864 /* Free the storage held by the components of a dfa. */
3865 void
3866 dfafree (struct dfa *d)
3868 free (d->charclasses);
3869 free (d->tokens);
3871 if (d->localeinfo.multibyte)
3872 free_mbdata (d);
3874 free (d->constraints);
3875 free (d->separates);
3877 for (idx_t i = 0; i < d->sindex; i++)
3879 free (d->states[i].elems.elems);
3880 free (d->states[i].mbps.elems);
3882 free (d->states);
3884 if (d->follows)
3886 for (idx_t i = 0; i < d->tindex; i++)
3887 free (d->follows[i].elems);
3888 free (d->follows);
3891 if (d->trans)
3893 for (idx_t i = 0; i < d->tralloc; i++)
3895 free (d->trans[i]);
3896 free (d->fails[i]);
3899 free (d->trans - 2);
3900 free (d->fails);
3901 free (d->newlines);
3902 free (d->success);
3905 if (d->superset)
3907 dfafree (d->superset);
3908 free (d->superset);
3912 /* Having found the postfix representation of the regular expression,
3913 try to find a long sequence of characters that must appear in any line
3914 containing the r.e.
3915 Finding a "longest" sequence is beyond the scope here;
3916 we take an easy way out and hope for the best.
3917 (Take "(ab|a)b"--please.)
3919 We do a bottom-up calculation of sequences of characters that must appear
3920 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3921 representation:
3922 sequences that must appear at the left of the match ("left")
3923 sequences that must appear at the right of the match ("right")
3924 lists of sequences that must appear somewhere in the match ("in")
3925 sequences that must constitute the match ("is")
3927 When we get to the root of the tree, we use one of the longest of its
3928 calculated "in" sequences as our answer.
3930 The sequences calculated for the various types of node (in pseudo ANSI c)
3931 are shown below. "p" is the operand of unary operators (and the left-hand
3932 operand of binary operators); "q" is the right-hand operand of binary
3933 operators.
3935 "ZERO" means "a zero-length sequence" below.
3937 Type left right is in
3938 ---- ---- ----- -- --
3939 char c # c # c # c # c
3941 ANYCHAR ZERO ZERO ZERO ZERO
3943 MBCSET ZERO ZERO ZERO ZERO
3945 CSET ZERO ZERO ZERO ZERO
3947 STAR ZERO ZERO ZERO ZERO
3949 QMARK ZERO ZERO ZERO ZERO
3951 PLUS p->left p->right ZERO p->in
3953 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3954 p->left : q->right : q->is!=ZERO) ? q->in plus
3955 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3956 ZERO
3958 OR longest common longest common (do p->is and substrings common
3959 leading trailing to q->is have same p->in and
3960 (sub)sequence (sub)sequence q->in length and content) ?
3961 of p->left of p->right
3962 and q->left and q->right p->is : NULL
3964 If there's anything else we recognize in the tree, all four sequences get set
3965 to zero-length sequences. If there's something we don't recognize in the
3966 tree, we just return a zero-length sequence.
3968 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3969 'aaa')?
3971 And ... is it here or someplace that we might ponder "optimizations" such as
3972 egrep 'psi|epsilon' -> egrep 'psi'
3973 egrep 'pepsi|epsilon' -> egrep 'epsi'
3974 (Yes, we now find "epsi" as a "string
3975 that must occur", but we might also
3976 simplify the *entire* r.e. being sought)
3977 grep '[c]' -> grep 'c'
3978 grep '(ab|a)b' -> grep 'ab'
3979 grep 'ab*' -> grep 'a'
3980 grep 'a*b' -> grep 'b'
3982 There are several issues:
3984 Is optimization easy (enough)?
3986 Does optimization actually accomplish anything,
3987 or is the automaton you get from "psi|epsilon" (for example)
3988 the same as the one you get from "psi" (for example)?
3990 Are optimizable r.e.'s likely to be used in real-life situations
3991 (something like 'ab*' is probably unlikely; something like is
3992 'psi|epsilon' is likelier)? */
3994 static char *
3995 icatalloc (char *old, char const *new)
3997 idx_t newsize = strlen (new);
3998 if (newsize == 0)
3999 return old;
4000 idx_t oldsize = strlen (old);
4001 char *result = xirealloc (old, oldsize + newsize + 1);
4002 memcpy (result + oldsize, new, newsize + 1);
4003 return result;
4006 static void
4007 freelist (char **cpp)
4009 while (*cpp)
4010 free (*cpp++);
4013 static char **
4014 enlistnew (char **cpp, char *new)
4016 /* Is there already something in the list that's new (or longer)? */
4017 idx_t i;
4018 for (i = 0; cpp[i] != NULL; i++)
4019 if (strstr (cpp[i], new) != NULL)
4021 free (new);
4022 return cpp;
4024 /* Eliminate any obsoleted strings. */
4025 for (idx_t j = 0; cpp[j] != NULL; )
4026 if (strstr (new, cpp[j]) == NULL)
4027 ++j;
4028 else
4030 free (cpp[j]);
4031 if (--i == j)
4032 break;
4033 cpp[j] = cpp[i];
4034 cpp[i] = NULL;
4036 /* Add the new string. */
4037 cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
4038 cpp[i] = new;
4039 cpp[i + 1] = NULL;
4040 return cpp;
4043 static char **
4044 enlist (char **cpp, char const *str, idx_t len)
4046 return enlistnew (cpp, ximemdup0 (str, len));
4049 /* Given pointers to two strings, return a pointer to an allocated
4050 list of their distinct common substrings. */
4051 static char **
4052 comsubs (char *left, char const *right)
4054 char **cpp = xzalloc (sizeof *cpp);
4056 for (char *lcp = left; *lcp != '\0'; lcp++)
4058 idx_t len = 0;
4059 char *rcp = strchr (right, *lcp);
4060 while (rcp != NULL)
4062 idx_t i;
4063 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4064 continue;
4065 if (i > len)
4066 len = i;
4067 rcp = strchr (rcp + 1, *lcp);
4069 if (len != 0)
4070 cpp = enlist (cpp, lcp, len);
4072 return cpp;
4075 static char **
4076 addlists (char **old, char **new)
4078 for (; *new; new++)
4079 old = enlistnew (old, xstrdup (*new));
4080 return old;
4083 /* Given two lists of substrings, return a new list giving substrings
4084 common to both. */
4085 static char **
4086 inboth (char **left, char **right)
4088 char **both = xzalloc (sizeof *both);
4090 for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4092 for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4094 char **temp = comsubs (left[lnum], right[rnum]);
4095 both = addlists (both, temp);
4096 freelist (temp);
4097 free (temp);
4100 return both;
4103 typedef struct must must;
4105 struct must
4107 char **in;
4108 char *left;
4109 char *right;
4110 char *is;
4111 bool begline;
4112 bool endline;
4113 must *prev;
4116 static must *
4117 allocmust (must *mp, idx_t size)
4119 must *new_mp = xmalloc (sizeof *new_mp);
4120 new_mp->in = xzalloc (sizeof *new_mp->in);
4121 new_mp->left = xizalloc (size);
4122 new_mp->right = xizalloc (size);
4123 new_mp->is = xizalloc (size);
4124 new_mp->begline = false;
4125 new_mp->endline = false;
4126 new_mp->prev = mp;
4127 return new_mp;
4130 static void
4131 resetmust (must *mp)
4133 freelist (mp->in);
4134 mp->in[0] = NULL;
4135 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4136 mp->begline = false;
4137 mp->endline = false;
4140 static void
4141 freemust (must *mp)
4143 freelist (mp->in);
4144 free (mp->in);
4145 free (mp->left);
4146 free (mp->right);
4147 free (mp->is);
4148 free (mp);
4151 struct dfamust *
4152 dfamust (struct dfa const *d)
4154 must *mp = NULL;
4155 char const *result = "";
4156 bool exact = false;
4157 bool begline = false;
4158 bool endline = false;
4159 bool need_begline = false;
4160 bool need_endline = false;
4161 bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4163 for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4165 token t = d->tokens[ri];
4166 switch (t)
4168 case BEGLINE:
4169 mp = allocmust (mp, 2);
4170 mp->begline = true;
4171 need_begline = true;
4172 break;
4173 case ENDLINE:
4174 mp = allocmust (mp, 2);
4175 mp->endline = true;
4176 need_endline = true;
4177 break;
4178 case LPAREN:
4179 case RPAREN:
4180 assert (!"neither LPAREN nor RPAREN may appear here");
4182 case EMPTY:
4183 case BEGWORD:
4184 case ENDWORD:
4185 case LIMWORD:
4186 case NOTLIMWORD:
4187 case BACKREF:
4188 case ANYCHAR:
4189 case MBCSET:
4190 mp = allocmust (mp, 2);
4191 break;
4193 case STAR:
4194 case QMARK:
4195 assume_nonnull (mp);
4196 resetmust (mp);
4197 break;
4199 case OR:
4201 char **new;
4202 must *rmp = mp;
4203 assume_nonnull (rmp);
4204 must *lmp = mp = mp->prev;
4205 assume_nonnull (lmp);
4206 idx_t j, ln, rn, n;
4208 /* Guaranteed to be. Unlikely, but ... */
4209 if (str_eq (lmp->is, rmp->is))
4211 lmp->begline &= rmp->begline;
4212 lmp->endline &= rmp->endline;
4214 else
4216 lmp->is[0] = '\0';
4217 lmp->begline = false;
4218 lmp->endline = false;
4220 /* Left side--easy */
4221 idx_t i = 0;
4222 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4223 ++i;
4224 lmp->left[i] = '\0';
4225 /* Right side */
4226 ln = strlen (lmp->right);
4227 rn = strlen (rmp->right);
4228 n = ln;
4229 if (n > rn)
4230 n = rn;
4231 for (i = 0; i < n; ++i)
4232 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4233 break;
4234 for (j = 0; j < i; ++j)
4235 lmp->right[j] = lmp->right[(ln - i) + j];
4236 lmp->right[j] = '\0';
4237 new = inboth (lmp->in, rmp->in);
4238 freelist (lmp->in);
4239 free (lmp->in);
4240 lmp->in = new;
4241 freemust (rmp);
4243 break;
4245 case PLUS:
4246 assume_nonnull (mp);
4247 mp->is[0] = '\0';
4248 break;
4250 case END:
4251 assume_nonnull (mp);
4252 assert (!mp->prev);
4253 for (idx_t i = 0; mp->in[i] != NULL; i++)
4254 if (strlen (mp->in[i]) > strlen (result))
4255 result = mp->in[i];
4256 if (str_eq (result, mp->is))
4258 if ((!need_begline || mp->begline) && (!need_endline
4259 || mp->endline))
4260 exact = true;
4261 begline = mp->begline;
4262 endline = mp->endline;
4264 goto done;
4266 case CAT:
4268 must *rmp = mp;
4269 assume_nonnull (rmp);
4270 must *lmp = mp = mp->prev;
4271 assume_nonnull (lmp);
4273 /* In. Everything in left, plus everything in
4274 right, plus concatenation of
4275 left's right and right's left. */
4276 lmp->in = addlists (lmp->in, rmp->in);
4277 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4279 idx_t lrlen = strlen (lmp->right);
4280 idx_t rllen = strlen (rmp->left);
4281 char *tp = ximalloc (lrlen + rllen + 1);
4282 memcpy (tp + lrlen, rmp->left, rllen + 1);
4283 memcpy (tp, lmp->right, lrlen);
4284 lmp->in = enlistnew (lmp->in, tp);
4286 /* Left-hand */
4287 if (lmp->is[0] != '\0')
4288 lmp->left = icatalloc (lmp->left, rmp->left);
4289 /* Right-hand */
4290 if (rmp->is[0] == '\0')
4291 lmp->right[0] = '\0';
4292 lmp->right = icatalloc (lmp->right, rmp->right);
4293 /* Guaranteed to be */
4294 if ((lmp->is[0] != '\0' || lmp->begline)
4295 && (rmp->is[0] != '\0' || rmp->endline))
4297 lmp->is = icatalloc (lmp->is, rmp->is);
4298 lmp->endline = rmp->endline;
4300 else
4302 lmp->is[0] = '\0';
4303 lmp->begline = false;
4304 lmp->endline = false;
4306 freemust (rmp);
4308 break;
4310 case '\0':
4311 /* Not on *my* shift. */
4312 goto done;
4314 default:
4315 if (CSET <= t)
4317 /* If T is a singleton, or if case-folding in a unibyte
4318 locale and T's members all case-fold to the same char,
4319 convert T to one of its members. Otherwise, do
4320 nothing further with T. */
4321 charclass *ccl = &d->charclasses[t - CSET];
4322 int j;
4323 for (j = 0; j < NOTCHAR; j++)
4324 if (tstbit (j, ccl))
4325 break;
4326 if (! (j < NOTCHAR))
4328 mp = allocmust (mp, 2);
4329 break;
4331 t = j;
4332 while (++j < NOTCHAR)
4333 if (tstbit (j, ccl)
4334 && ! (case_fold_unibyte
4335 && toupper (j) == toupper (t)))
4336 break;
4337 if (j < NOTCHAR)
4339 mp = allocmust (mp, 2);
4340 break;
4344 idx_t rj = ri + 2;
4345 if (d->tokens[ri + 1] == CAT)
4347 for (; rj < d->tindex - 1; rj += 2)
4349 if ((rj != ri && (d->tokens[rj] <= 0
4350 || NOTCHAR <= d->tokens[rj]))
4351 || d->tokens[rj + 1] != CAT)
4352 break;
4355 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4356 mp->is[0] = mp->left[0] = mp->right[0]
4357 = case_fold_unibyte ? toupper (t) : t;
4359 idx_t i;
4360 for (i = 1; ri + 2 < rj; i++)
4362 ri += 2;
4363 t = d->tokens[ri];
4364 mp->is[i] = mp->left[i] = mp->right[i]
4365 = case_fold_unibyte ? toupper (t) : t;
4367 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4368 mp->in = enlist (mp->in, mp->is, i);
4369 break;
4372 done:;
4374 struct dfamust *dm = NULL;
4375 if (*result)
4377 dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4378 dm->exact = exact;
4379 dm->begline = begline;
4380 dm->endline = endline;
4381 strcpy (dm->must, result);
4384 while (mp)
4386 must *prev = mp->prev;
4387 freemust (mp);
4388 mp = prev;
4391 return dm;
4394 void
4395 dfamustfree (struct dfamust *dm)
4397 free (dm);
4400 struct dfa *
4401 dfaalloc (void)
4403 return xmalloc (sizeof (struct dfa));
4406 /* Initialize DFA. */
4407 void
4408 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4409 reg_syntax_t bits, int dfaopts)
4411 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4412 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4413 dfa->localeinfo = *linfo;
4415 dfa->fast = !dfa->localeinfo.multibyte;
4417 dfa->canychar = -1;
4418 dfa->syntax.syntax_bits_set = true;
4419 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4420 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4421 dfa->syntax.syntax_bits = bits;
4422 dfa->syntax.dfaopts = dfaopts;
4424 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4426 unsigned char uc = i;
4428 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4429 switch (dfa->syntax.sbit[uc])
4431 case CTX_LETTER:
4432 setbit (uc, &dfa->syntax.letters);
4433 break;
4434 case CTX_NEWLINE:
4435 setbit (uc, &dfa->syntax.newline);
4436 break;
4439 /* POSIX requires that the five bytes in "\n\r./" (including the
4440 terminating NUL) cannot occur inside a multibyte character. */
4441 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4442 ? (uc & 0xc0) != 0x80
4443 : strchr ("\n\r./", uc) != NULL);
4447 /* Initialize TO by copying FROM's syntax settings. */
4448 void
4449 dfacopysyntax (struct dfa *to, struct dfa const *from)
4451 memset (to, 0, offsetof (struct dfa, syntax));
4452 to->canychar = -1;
4453 to->fast = from->fast;
4454 to->syntax = from->syntax;
4455 to->dfaexec = from->dfaexec;
4456 to->localeinfo = from->localeinfo;
4459 /* vim:set shiftwidth=2: */