build: avoid another warning
[grep/ericb.git] / src / dfasearch.c
blob5de40b686d9701b4073b94f7549c858a48211d6c
1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
19 /* Written August 1992 by Mike Haertel. */
21 #include <config.h>
22 #include "search.h"
23 #include "dfa.h"
25 /* For -w, we also consider _ to be word constituent. */
26 #define WCHAR(C) (ISALNUM(C) || (C) == '_')
28 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
29 a list of strings, at least one of which is known to occur in
30 any string matching the regexp. */
31 static kwset_t kwset;
33 /* DFA compiled regexp. */
34 static struct dfa dfa;
36 /* The Regex compiled patterns. */
37 static struct patterns
39 /* Regex compiled regexp. */
40 struct re_pattern_buffer regexbuf;
41 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
42 Q@#%!# library interface in regex.c. */
43 } patterns0;
45 struct patterns *patterns;
46 size_t pcount;
48 void
49 dfaerror (char const *mesg)
51 error (EXIT_TROUBLE, 0, "%s", mesg);
53 /* notreached */
54 /* Tell static analyzers that this function does not return. */
55 abort ();
58 /* Number of compiled fixed strings known to exactly match the regexp.
59 If kwsexec returns < kwset_exact_matches, then we don't need to
60 call the regexp matcher at all. */
61 static int kwset_exact_matches;
63 static char const *
64 kwsincr_case (const char *must)
66 const char *buf;
67 size_t n;
69 n = strlen (must);
70 #ifdef MBS_SUPPORT
71 if (match_icase && MB_CUR_MAX > 1)
72 buf = mbtolower (must, &n);
73 else
74 #endif
75 buf = must;
76 return kwsincr (kwset, buf, n);
79 /* If the DFA turns out to have some set of fixed strings one of
80 which must occur in the match, then we build a kwset matcher
81 to find those strings, and thus quickly filter out impossible
82 matches. */
83 static void
84 kwsmusts (void)
86 struct dfamust const *dm;
87 char const *err;
89 if (dfa.musts)
91 kwsinit (&kwset);
92 /* First, we compile in the substrings known to be exact
93 matches. The kwset matcher will return the index
94 of the matching string that it chooses. */
95 for (dm = dfa.musts; dm; dm = dm->next)
97 if (!dm->exact)
98 continue;
99 ++kwset_exact_matches;
100 if ((err = kwsincr_case (dm->must)) != NULL)
101 error (EXIT_TROUBLE, 0, "%s", err);
103 /* Now, we compile the substrings that will require
104 the use of the regexp matcher. */
105 for (dm = dfa.musts; dm; dm = dm->next)
107 if (dm->exact)
108 continue;
109 if ((err = kwsincr_case (dm->must)) != NULL)
110 error (EXIT_TROUBLE, 0, "%s", err);
112 if ((err = kwsprep (kwset)) != NULL)
113 error (EXIT_TROUBLE, 0, "%s", err);
117 void
118 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
120 const char *err;
121 const char *p, *sep;
122 size_t total = size;
123 char *motif;
125 if (match_icase)
126 syntax_bits |= RE_ICASE;
127 re_set_syntax (syntax_bits);
128 dfasyntax (syntax_bits, match_icase, eolbyte);
130 /* For GNU regex compiler we have to pass the patterns separately to detect
131 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
132 GNU regex should have raise a syntax error. The same for backref, where
133 the backref should have been local to each pattern. */
134 p = pattern;
137 size_t len;
138 sep = memchr (p, '\n', total);
139 if (sep)
141 len = sep - p;
142 sep++;
143 total -= (len + 1);
145 else
147 len = total;
148 total = 0;
151 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
152 if (patterns == NULL)
153 error (EXIT_TROUBLE, errno, _("memory exhausted"));
154 patterns[pcount] = patterns0;
156 if ((err = re_compile_pattern (p, len,
157 &(patterns[pcount].regexbuf))) != NULL)
158 error (EXIT_TROUBLE, 0, "%s", err);
159 pcount++;
161 p = sep;
162 } while (sep && total != 0);
164 /* In the match_words and match_lines cases, we use a different pattern
165 for the DFA matcher that will quickly throw out cases that won't work.
166 Then if DFA succeeds we do some hairy stuff using the regex matcher
167 to decide whether the match should really count. */
168 if (match_words || match_lines)
170 static char const line_beg_no_bk[] = "^(";
171 static char const line_end_no_bk[] = ")$";
172 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
173 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
174 static char const line_beg_bk[] = "^\\(";
175 static char const line_end_bk[] = "\\)$";
176 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
177 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
178 int bk = !(syntax_bits & RE_NO_BK_PARENS);
179 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
181 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
182 : (bk ? word_beg_bk : word_beg_no_bk));
183 total = strlen(n);
184 memcpy (n + total, pattern, size);
185 total += size;
186 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
187 : (bk ? word_end_bk : word_end_no_bk));
188 total += strlen (n + total);
189 pattern = motif = n;
190 size = total;
192 else
193 motif = NULL;
195 dfacomp (pattern, size, &dfa, 1);
196 kwsmusts ();
198 free(motif);
201 size_t
202 EGexecute (char const *buf, size_t size, size_t *match_size,
203 char const *start_ptr)
205 char const *buflim, *beg, *end, *match, *best_match, *mb_start;
206 char eol = eolbyte;
207 int backref, start, len, best_len;
208 struct kwsmatch kwsm;
209 size_t i, ret_val;
210 #ifdef MBS_SUPPORT
211 if (MB_CUR_MAX > 1)
213 if (match_icase)
215 /* mbtolower adds a NUL byte at the end. That will provide
216 space for the sentinel byte dfaexec may add. */
217 char *case_buf = mbtolower (buf, &size);
218 if (start_ptr)
219 start_ptr = case_buf + (start_ptr - buf);
220 buf = case_buf;
223 #endif /* MBS_SUPPORT */
225 mb_start = buf;
226 buflim = buf + size;
228 for (beg = end = buf; end < buflim; beg = end)
230 if (!start_ptr)
232 /* We don't care about an exact match. */
233 if (kwset)
235 /* Find a possible match using the KWset matcher. */
236 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
237 if (offset == (size_t) -1)
238 goto failure;
239 beg += offset;
240 /* Narrow down to the line containing the candidate, and
241 run it through DFA. */
242 if ((end = memchr(beg, eol, buflim - beg)) != NULL)
243 end++;
244 else
245 end = buflim;
246 match = beg;
247 while (beg > buf && beg[-1] != eol)
248 --beg;
249 if (kwsm.index < kwset_exact_matches)
251 #ifdef MBS_SUPPORT
252 if (mb_start < beg)
253 mb_start = beg;
254 if (MB_CUR_MAX == 1
255 || !is_mb_middle (&mb_start, match, buflim,
256 kwsm.size[0]))
257 #endif
258 goto success;
260 if (dfaexec (&dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
261 continue;
263 else
265 /* No good fixed strings; start with DFA. */
266 char const *next_beg = dfaexec (&dfa, beg, (char *) buflim,
267 0, NULL, &backref);
268 if (next_beg == NULL)
269 break;
270 /* Narrow down to the line we've found. */
271 beg = next_beg;
272 if ((end = memchr(beg, eol, buflim - beg)) != NULL)
273 end++;
274 else
275 end = buflim;
276 while (beg > buf && beg[-1] != eol)
277 --beg;
279 /* Successful, no backreferences encountered! */
280 if (!backref)
281 goto success;
283 else
285 /* We are looking for the leftmost (then longest) exact match.
286 We will go through the outer loop only once. */
287 beg = start_ptr;
288 end = buflim;
291 /* If we've made it to this point, this means DFA has seen
292 a probable match, and we need to run it through Regex. */
293 best_match = end;
294 best_len = 0;
295 for (i = 0; i < pcount; i++)
297 patterns[i].regexbuf.not_eol = 0;
298 if (0 <= (start = re_search (&(patterns[i].regexbuf),
299 buf, end - buf - 1,
300 beg - buf, end - beg - 1,
301 &(patterns[i].regs))))
303 len = patterns[i].regs.end[0] - start;
304 match = buf + start;
305 if (match > best_match)
306 continue;
307 if (start_ptr && !match_words)
308 goto assess_pattern_match;
309 if ((!match_lines && !match_words)
310 || (match_lines && len == end - beg - 1))
312 match = beg;
313 len = end - beg;
314 goto assess_pattern_match;
316 /* If -w, check if the match aligns with word boundaries.
317 We do this iteratively because:
318 (a) the line may contain more than one occurence of the
319 pattern, and
320 (b) Several alternatives in the pattern might be valid at a
321 given point, and we may need to consider a shorter one to
322 find a word boundary. */
323 if (match_words)
324 while (match <= best_match)
326 if ((match == buf || !WCHAR ((unsigned char) match[-1]))
327 && (len == end - beg - 1
328 || !WCHAR ((unsigned char) match[len])))
329 goto assess_pattern_match;
330 if (len > 0)
332 /* Try a shorter length anchored at the same place. */
333 --len;
334 patterns[i].regexbuf.not_eol = 1;
335 len = re_match (&(patterns[i].regexbuf),
336 buf, match + len - beg, match - buf,
337 &(patterns[i].regs));
339 if (len <= 0)
341 /* Try looking further on. */
342 if (match == end - 1)
343 break;
344 match++;
345 patterns[i].regexbuf.not_eol = 0;
346 start = re_search (&(patterns[i].regexbuf),
347 buf, end - buf - 1,
348 match - buf, end - match - 1,
349 &(patterns[i].regs));
350 if (start < 0)
351 break;
352 len = patterns[i].regs.end[0] - start;
353 match = buf + start;
355 } /* while (match <= best_match) */
356 continue;
357 assess_pattern_match:
358 if (!start_ptr)
360 /* Good enough for a non-exact match.
361 No need to look at further patterns, if any. */
362 goto success;
364 if (match < best_match || (match == best_match && len > best_len))
366 /* Best exact match: leftmost, then longest. */
367 best_match = match;
368 best_len = len;
370 } /* if re_search >= 0 */
371 } /* for Regex patterns. */
372 if (best_match < end)
374 /* We have found an exact match. We were just
375 waiting for the best one (leftmost then longest). */
376 beg = best_match;
377 len = best_len;
378 goto success_in_len;
380 } /* for (beg = end ..) */
382 failure:
383 ret_val = -1;
384 goto out;
386 success:
387 len = end - beg;
388 success_in_len:
389 *match_size = len;
390 ret_val = beg - buf;
391 out:
392 return ret_val;