1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
19 /* Written August 1992 by Mike Haertel. */
25 /* For -w, we also consider _ to be word constituent. */
26 #define WCHAR(C) (ISALNUM(C) || (C) == '_')
28 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
29 a list of strings, at least one of which is known to occur in
30 any string matching the regexp. */
33 /* DFA compiled regexp. */
34 static struct dfa dfa
;
36 /* The Regex compiled patterns. */
37 static struct patterns
39 /* Regex compiled regexp. */
40 struct re_pattern_buffer regexbuf
;
41 struct re_registers regs
; /* This is here on account of a BRAIN-DEAD
42 Q@#%!# library interface in regex.c. */
45 struct patterns
*patterns
;
49 dfaerror (char const *mesg
)
51 error (EXIT_TROUBLE
, 0, "%s", mesg
);
54 /* Tell static analyzers that this function does not return. */
58 /* Number of compiled fixed strings known to exactly match the regexp.
59 If kwsexec returns < kwset_exact_matches, then we don't need to
60 call the regexp matcher at all. */
61 static int kwset_exact_matches
;
64 kwsincr_case (const char *must
)
71 if (match_icase
&& MB_CUR_MAX
> 1)
72 buf
= mbtolower (must
, &n
);
76 return kwsincr (kwset
, buf
, n
);
79 /* If the DFA turns out to have some set of fixed strings one of
80 which must occur in the match, then we build a kwset matcher
81 to find those strings, and thus quickly filter out impossible
86 struct dfamust
const *dm
;
92 /* First, we compile in the substrings known to be exact
93 matches. The kwset matcher will return the index
94 of the matching string that it chooses. */
95 for (dm
= dfa
.musts
; dm
; dm
= dm
->next
)
99 ++kwset_exact_matches
;
100 if ((err
= kwsincr_case (dm
->must
)) != NULL
)
101 error (EXIT_TROUBLE
, 0, "%s", err
);
103 /* Now, we compile the substrings that will require
104 the use of the regexp matcher. */
105 for (dm
= dfa
.musts
; dm
; dm
= dm
->next
)
109 if ((err
= kwsincr_case (dm
->must
)) != NULL
)
110 error (EXIT_TROUBLE
, 0, "%s", err
);
112 if ((err
= kwsprep (kwset
)) != NULL
)
113 error (EXIT_TROUBLE
, 0, "%s", err
);
118 GEAcompile (char const *pattern
, size_t size
, reg_syntax_t syntax_bits
)
126 syntax_bits
|= RE_ICASE
;
127 re_set_syntax (syntax_bits
);
128 dfasyntax (syntax_bits
, match_icase
, eolbyte
);
130 /* For GNU regex compiler we have to pass the patterns separately to detect
131 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
132 GNU regex should have raise a syntax error. The same for backref, where
133 the backref should have been local to each pattern. */
138 sep
= memchr (p
, '\n', total
);
151 patterns
= realloc (patterns
, (pcount
+ 1) * sizeof (*patterns
));
152 if (patterns
== NULL
)
153 error (EXIT_TROUBLE
, errno
, _("memory exhausted"));
154 patterns
[pcount
] = patterns0
;
156 if ((err
= re_compile_pattern (p
, len
,
157 &(patterns
[pcount
].regexbuf
))) != NULL
)
158 error (EXIT_TROUBLE
, 0, "%s", err
);
162 } while (sep
&& total
!= 0);
164 /* In the match_words and match_lines cases, we use a different pattern
165 for the DFA matcher that will quickly throw out cases that won't work.
166 Then if DFA succeeds we do some hairy stuff using the regex matcher
167 to decide whether the match should really count. */
168 if (match_words
|| match_lines
)
170 static char const line_beg_no_bk
[] = "^(";
171 static char const line_end_no_bk
[] = ")$";
172 static char const word_beg_no_bk
[] = "(^|[^[:alnum:]_])(";
173 static char const word_end_no_bk
[] = ")([^[:alnum:]_]|$)";
174 static char const line_beg_bk
[] = "^\\(";
175 static char const line_end_bk
[] = "\\)$";
176 static char const word_beg_bk
[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
177 static char const word_end_bk
[] = "\\)\\([^[:alnum:]_]\\|$\\)";
178 int bk
= !(syntax_bits
& RE_NO_BK_PARENS
);
179 char *n
= xmalloc (sizeof word_beg_bk
- 1 + size
+ sizeof word_end_bk
);
181 strcpy (n
, match_lines
? (bk
? line_beg_bk
: line_beg_no_bk
)
182 : (bk
? word_beg_bk
: word_beg_no_bk
));
184 memcpy (n
+ total
, pattern
, size
);
186 strcpy (n
+ total
, match_lines
? (bk
? line_end_bk
: line_end_no_bk
)
187 : (bk
? word_end_bk
: word_end_no_bk
));
188 total
+= strlen (n
+ total
);
195 dfacomp (pattern
, size
, &dfa
, 1);
202 EGexecute (char const *buf
, size_t size
, size_t *match_size
,
203 char const *start_ptr
)
205 char const *buflim
, *beg
, *end
, *match
, *best_match
, *mb_start
;
207 int backref
, start
, len
, best_len
;
208 struct kwsmatch kwsm
;
215 /* mbtolower adds a NUL byte at the end. That will provide
216 space for the sentinel byte dfaexec may add. */
217 char *case_buf
= mbtolower (buf
, &size
);
219 start_ptr
= case_buf
+ (start_ptr
- buf
);
223 #endif /* MBS_SUPPORT */
228 for (beg
= end
= buf
; end
< buflim
; beg
= end
)
232 /* We don't care about an exact match. */
235 /* Find a possible match using the KWset matcher. */
236 size_t offset
= kwsexec (kwset
, beg
, buflim
- beg
, &kwsm
);
237 if (offset
== (size_t) -1)
240 /* Narrow down to the line containing the candidate, and
241 run it through DFA. */
242 if ((end
= memchr(beg
, eol
, buflim
- beg
)) != NULL
)
247 while (beg
> buf
&& beg
[-1] != eol
)
249 if (kwsm
.index
< kwset_exact_matches
)
255 || !is_mb_middle (&mb_start
, match
, buflim
,
260 if (dfaexec (&dfa
, beg
, (char *) end
, 0, NULL
, &backref
) == NULL
)
265 /* No good fixed strings; start with DFA. */
266 char const *next_beg
= dfaexec (&dfa
, beg
, (char *) buflim
,
268 if (next_beg
== NULL
)
270 /* Narrow down to the line we've found. */
272 if ((end
= memchr(beg
, eol
, buflim
- beg
)) != NULL
)
276 while (beg
> buf
&& beg
[-1] != eol
)
279 /* Successful, no backreferences encountered! */
285 /* We are looking for the leftmost (then longest) exact match.
286 We will go through the outer loop only once. */
291 /* If we've made it to this point, this means DFA has seen
292 a probable match, and we need to run it through Regex. */
295 for (i
= 0; i
< pcount
; i
++)
297 patterns
[i
].regexbuf
.not_eol
= 0;
298 if (0 <= (start
= re_search (&(patterns
[i
].regexbuf
),
300 beg
- buf
, end
- beg
- 1,
301 &(patterns
[i
].regs
))))
303 len
= patterns
[i
].regs
.end
[0] - start
;
305 if (match
> best_match
)
307 if (start_ptr
&& !match_words
)
308 goto assess_pattern_match
;
309 if ((!match_lines
&& !match_words
)
310 || (match_lines
&& len
== end
- beg
- 1))
314 goto assess_pattern_match
;
316 /* If -w, check if the match aligns with word boundaries.
317 We do this iteratively because:
318 (a) the line may contain more than one occurence of the
320 (b) Several alternatives in the pattern might be valid at a
321 given point, and we may need to consider a shorter one to
322 find a word boundary. */
324 while (match
<= best_match
)
326 if ((match
== buf
|| !WCHAR ((unsigned char) match
[-1]))
327 && (len
== end
- beg
- 1
328 || !WCHAR ((unsigned char) match
[len
])))
329 goto assess_pattern_match
;
332 /* Try a shorter length anchored at the same place. */
334 patterns
[i
].regexbuf
.not_eol
= 1;
335 len
= re_match (&(patterns
[i
].regexbuf
),
336 buf
, match
+ len
- beg
, match
- buf
,
337 &(patterns
[i
].regs
));
341 /* Try looking further on. */
342 if (match
== end
- 1)
345 patterns
[i
].regexbuf
.not_eol
= 0;
346 start
= re_search (&(patterns
[i
].regexbuf
),
348 match
- buf
, end
- match
- 1,
349 &(patterns
[i
].regs
));
352 len
= patterns
[i
].regs
.end
[0] - start
;
355 } /* while (match <= best_match) */
357 assess_pattern_match
:
360 /* Good enough for a non-exact match.
361 No need to look at further patterns, if any. */
364 if (match
< best_match
|| (match
== best_match
&& len
> best_len
))
366 /* Best exact match: leftmost, then longest. */
370 } /* if re_search >= 0 */
371 } /* for Regex patterns. */
372 if (best_match
< end
)
374 /* We have found an exact match. We were just
375 waiting for the best one (leftmost then longest). */
380 } /* for (beg = end ..) */