1 /* Pattern Matchers for Regular Expressions.
2 Copyright (C) 1992, 1998, 2000, 2005 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
34 /* This must be included _after_ m-common.h: It depends on MBS_SUPPORT. */
37 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
38 # define IN_CTYPE_DOMAIN(c) 1
40 # define IN_CTYPE_DOMAIN(c) isascii(c)
42 #define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C))
44 struct compiled_regex
{
49 /* DFA compiled regexp. */
52 /* The Regex compiled patterns. */
55 /* Regex compiled regexp. */
56 struct re_pattern_buffer regexbuf
;
57 struct re_registers regs
; /* This is here on account of a BRAIN-DEAD
58 Q@#%!# library interface in regex.c. */
62 /* KWset compiled pattern. We compile a list of strings, at least one of
63 which is known to occur in any string matching the regexp. */
64 struct compiled_kwset ckwset
;
66 /* Number of compiled fixed strings known to exactly match the regexp.
67 If kwsexec returns < kwset_exact_matches, then we don't need to
68 call the regexp matcher at all. */
69 int kwset_exact_matches
;
72 /* Callback from dfa.c. */
74 dfaerror (const char *mesg
)
76 error (exit_failure
, 0, mesg
);
79 /* If the DFA turns out to have some set of fixed strings one of
80 which must occur in the match, then we build a kwset matcher
81 to find those strings, and thus quickly filter out impossible
84 kwsmusts (struct compiled_regex
*cregex
,
85 bool match_icase
, bool match_words
, bool match_lines
, char eolbyte
)
87 struct dfamust
const *dm
;
90 if (cregex
->dfa
.musts
)
92 kwsinit (&cregex
->ckwset
, match_icase
, match_words
, match_lines
, eolbyte
);
93 /* First, we compile in the substrings known to be exact
94 matches. The kwset matcher will return the index
95 of the matching string that it chooses. */
96 for (dm
= cregex
->dfa
.musts
; dm
; dm
= dm
->next
)
100 cregex
->kwset_exact_matches
++;
101 if ((err
= kwsincr (cregex
->ckwset
.kwset
, dm
->must
, strlen (dm
->must
))) != NULL
)
102 error (exit_failure
, 0, err
);
104 /* Now, we compile the substrings that will require
105 the use of the regexp matcher. */
106 for (dm
= cregex
->dfa
.musts
; dm
; dm
= dm
->next
)
110 if ((err
= kwsincr (cregex
->ckwset
.kwset
, dm
->must
, strlen (dm
->must
))) != NULL
)
111 error (exit_failure
, 0, err
);
113 if ((err
= kwsprep (cregex
->ckwset
.kwset
)) != NULL
)
114 error (exit_failure
, 0, err
);
119 Gcompile (const char *pattern
, size_t pattern_size
,
120 bool match_icase
, bool match_words
, bool match_lines
, char eolbyte
)
122 struct compiled_regex
*cregex
;
125 size_t total
= pattern_size
;
126 const char *motif
= pattern
;
128 cregex
= (struct compiled_regex
*) xmalloc (sizeof (struct compiled_regex
));
129 memset (cregex
, '\0', sizeof (struct compiled_regex
));
130 cregex
->match_words
= match_words
;
131 cregex
->match_lines
= match_lines
;
132 cregex
->eolbyte
= eolbyte
;
133 cregex
->patterns
= NULL
;
136 re_set_syntax (RE_SYNTAX_GREP
| RE_HAT_LISTS_NOT_NEWLINE
);
137 dfasyntax (RE_SYNTAX_GREP
| RE_HAT_LISTS_NOT_NEWLINE
, match_icase
, eolbyte
);
139 /* For GNU regex compiler we have to pass the patterns separately to detect
140 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
141 GNU regex should have raise a syntax error. The same for backref, where
142 the backref should have been local to each pattern. */
146 sep
= memchr (motif
, '\n', total
);
159 cregex
->patterns
= xrealloc (cregex
->patterns
, (cregex
->pcount
+ 1) * sizeof (struct patterns
));
160 memset (&cregex
->patterns
[cregex
->pcount
], '\0', sizeof (struct patterns
));
162 if ((err
= re_compile_pattern (motif
, len
,
163 &(cregex
->patterns
[cregex
->pcount
].regexbuf
))) != NULL
)
164 error (exit_failure
, 0, err
);
168 } while (sep
&& total
!= 0);
170 /* In the match_words and match_lines cases, we use a different pattern
171 for the DFA matcher that will quickly throw out cases that won't work.
172 Then if DFA succeeds we do some hairy stuff using the regex matcher
173 to decide whether the match should really count. */
174 if (match_words
|| match_lines
)
176 /* In the whole-word case, we use the pattern:
177 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
178 In the whole-line case, we use the pattern:
179 ^\(userpattern\)$. */
181 static const char line_beg
[] = "^\\(";
182 static const char line_end
[] = "\\)$";
183 static const char word_beg
[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
184 static const char word_end
[] = "\\)\\([^[:alnum:]_]\\|$\\)";
185 char *n
= (char *) xmalloc (sizeof word_beg
- 1 + pattern_size
+ sizeof word_end
);
187 strcpy (n
, match_lines
? line_beg
: word_beg
);
189 memcpy (n
+ i
, pattern
, pattern_size
);
191 strcpy (n
+ i
, match_lines
? line_end
: word_end
);
197 dfacomp (pattern
, pattern_size
, &cregex
->dfa
, 1);
198 kwsmusts (cregex
, match_icase
, match_words
, match_lines
, eolbyte
);
204 compile (const char *pattern
, size_t pattern_size
,
205 bool match_icase
, bool match_words
, bool match_lines
, char eolbyte
,
208 struct compiled_regex
*cregex
;
211 size_t total
= pattern_size
;
212 const char *motif
= pattern
;
214 cregex
= (struct compiled_regex
*) xmalloc (sizeof (struct compiled_regex
));
215 memset (cregex
, '\0', sizeof (struct compiled_regex
));
216 cregex
->match_words
= match_words
;
217 cregex
->match_lines
= match_lines
;
218 cregex
->eolbyte
= eolbyte
;
219 cregex
->patterns
= NULL
;
222 re_set_syntax (syntax
);
223 dfasyntax (syntax
, match_icase
, eolbyte
);
225 /* For GNU regex compiler we have to pass the patterns separately to detect
226 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
227 GNU regex should have raise a syntax error. The same for backref, where
228 the backref should have been local to each pattern. */
232 sep
= memchr (motif
, '\n', total
);
245 cregex
->patterns
= xrealloc (cregex
->patterns
, (cregex
->pcount
+ 1) * sizeof (struct patterns
));
246 memset (&cregex
->patterns
[cregex
->pcount
], '\0', sizeof (struct patterns
));
248 if ((err
= re_compile_pattern (motif
, len
,
249 &(cregex
->patterns
[cregex
->pcount
].regexbuf
))) != NULL
)
250 error (exit_failure
, 0, err
);
254 } while (sep
&& total
!= 0);
256 /* In the match_words and match_lines cases, we use a different pattern
257 for the DFA matcher that will quickly throw out cases that won't work.
258 Then if DFA succeeds we do some hairy stuff using the regex matcher
259 to decide whether the match should really count. */
260 if (match_words
|| match_lines
)
262 /* In the whole-word case, we use the pattern:
263 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
264 In the whole-line case, we use the pattern:
267 static const char line_beg
[] = "^(";
268 static const char line_end
[] = ")$";
269 static const char word_beg
[] = "(^|[^[:alnum:]_])(";
270 static const char word_end
[] = ")([^[:alnum:]_]|$)";
271 char *n
= (char *) xmalloc (sizeof word_beg
- 1 + pattern_size
+ sizeof word_end
);
273 strcpy (n
, match_lines
? line_beg
: word_beg
);
275 memcpy (n
+ i
, pattern
, pattern_size
);
277 strcpy (n
+ i
, match_lines
? line_end
: word_end
);
283 dfacomp (pattern
, pattern_size
, &cregex
->dfa
, 1);
284 kwsmusts (cregex
, match_icase
, match_words
, match_lines
, eolbyte
);
290 Ecompile (const char *pattern
, size_t pattern_size
,
291 bool match_icase
, bool match_words
, bool match_lines
, char eolbyte
)
293 return compile (pattern
, pattern_size
,
294 match_icase
, match_words
, match_lines
, eolbyte
,
295 RE_SYNTAX_POSIX_EGREP
);
299 AWKcompile (const char *pattern
, size_t pattern_size
,
300 bool match_icase
, bool match_words
, bool match_lines
, char eolbyte
)
302 return compile (pattern
, pattern_size
,
303 match_icase
, match_words
, match_lines
, eolbyte
,
308 EGexecute (const void *compiled_pattern
,
309 const char *buf
, size_t buf_size
,
310 size_t *match_size
, bool exact
)
312 struct compiled_regex
*cregex
= (struct compiled_regex
*) compiled_pattern
;
313 register const char *buflim
, *beg
, *end
;
314 char eol
= cregex
->eolbyte
;
315 int backref
, start
, len
;
316 struct kwsmatch kwsm
;
319 char *mb_properties
= NULL
;
320 #endif /* MBS_SUPPORT */
323 if (MB_CUR_MAX
> 1 && cregex
->ckwset
.kwset
)
324 mb_properties
= check_multibyte_string (buf
, buf_size
);
325 #endif /* MBS_SUPPORT */
327 buflim
= buf
+ buf_size
;
329 for (beg
= end
= buf
; end
< buflim
; beg
= end
)
333 if (cregex
->ckwset
.kwset
)
335 /* Find a possible match using the KWset matcher. */
336 size_t offset
= kwsexec (cregex
->ckwset
.kwset
, beg
, buflim
- beg
, &kwsm
);
337 if (offset
== (size_t) -1)
341 free (mb_properties
);
346 /* Narrow down to the line containing the candidate, and
347 run it through DFA. */
348 end
= memchr (beg
, eol
, buflim
- beg
);
354 if (MB_CUR_MAX
> 1 && mb_properties
[beg
- buf
] == 0)
357 while (beg
> buf
&& beg
[-1] != eol
)
359 if (kwsm
.index
< cregex
->kwset_exact_matches
)
361 if (dfaexec (&cregex
->dfa
, beg
, end
- beg
, &backref
) == (size_t) -1)
366 /* No good fixed strings; start with DFA. */
367 size_t offset
= dfaexec (&cregex
->dfa
, beg
, buflim
- beg
, &backref
);
368 if (offset
== (size_t) -1)
370 /* Narrow down to the line we've found. */
372 end
= memchr (beg
, eol
, buflim
- beg
);
377 while (beg
> buf
&& beg
[-1] != eol
)
380 /* Successful, no backreferences encountered! */
385 end
= beg
+ buf_size
;
387 /* If we've made it to this point, this means DFA has seen
388 a probable match, and we need to run it through Regex. */
389 for (i
= 0; i
< cregex
->pcount
; i
++)
391 cregex
->patterns
[i
].regexbuf
.not_eol
= 0;
392 if (0 <= (start
= re_search (&(cregex
->patterns
[i
].regexbuf
), beg
,
394 end
- beg
- 1, &(cregex
->patterns
[i
].regs
))))
396 len
= cregex
->patterns
[i
].regs
.end
[0] - start
;
402 if ((!cregex
->match_lines
&& !cregex
->match_words
)
403 || (cregex
->match_lines
&& len
== end
- beg
- 1))
405 /* If -w, check if the match aligns with word boundaries.
406 We do this iteratively because:
407 (a) the line may contain more than one occurence of the
409 (b) Several alternatives in the pattern might be valid at a
410 given point, and we may need to consider a shorter one to
411 find a word boundary. */
412 if (cregex
->match_words
)
415 if ((start
== 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg
[start
- 1]))
416 && (len
== end
- beg
- 1
417 || !IS_WORD_CONSTITUENT ((unsigned char) beg
[start
+ len
])))
421 /* Try a shorter length anchored at the same place. */
423 cregex
->patterns
[i
].regexbuf
.not_eol
= 1;
424 len
= re_match (&(cregex
->patterns
[i
].regexbuf
), beg
,
426 &(cregex
->patterns
[i
].regs
));
430 /* Try looking further on. */
431 if (start
== end
- beg
- 1)
434 cregex
->patterns
[i
].regexbuf
.not_eol
= 0;
435 start
= re_search (&(cregex
->patterns
[i
].regexbuf
), beg
,
437 start
, end
- beg
- 1 - start
,
438 &(cregex
->patterns
[i
].regs
));
439 len
= cregex
->patterns
[i
].regs
.end
[0] - start
;
443 } /* for Regex patterns. */
444 } /* for (beg = end ..) */
446 if (MB_CUR_MAX
> 1 && mb_properties
)
447 free (mb_properties
);
448 #endif /* MBS_SUPPORT */
453 if (MB_CUR_MAX
> 1 && mb_properties
)
454 free (mb_properties
);
455 #endif /* MBS_SUPPORT */
456 *match_size
= end
- beg
;
461 EGfree (void *compiled_pattern
)
463 struct compiled_regex
*cregex
= (struct compiled_regex
*) compiled_pattern
;
465 dfafree (&cregex
->dfa
);
466 free (cregex
->patterns
);
467 free (cregex
->ckwset
.trans
);
471 /* POSIX Basic Regular Expressions */
472 matcher_t matcher_grep
=
479 /* POSIX Extended Regular Expressions */
480 matcher_t matcher_egrep
=
487 /* AWK Regular Expressions */
488 matcher_t matcher_awk
=