Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / libgrep / m-regex.c
blob0af29a873786860d67b4dbf21e2e2874a64ee3c8
1 /* Pattern Matchers for Regular Expressions.
2 Copyright (C) 1992, 1998, 2000, 2005 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 #if HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 /* Specification. */
23 #include "libgrep.h"
25 #include <ctype.h>
26 #include <stdlib.h>
27 #include <string.h>
29 #include "error.h"
30 #include "exitfail.h"
31 #include "xalloc.h"
32 #include "m-common.h"
34 /* This must be included _after_ m-common.h: It depends on MBS_SUPPORT. */
35 #include "dfa.h"
37 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
38 # define IN_CTYPE_DOMAIN(c) 1
39 #else
40 # define IN_CTYPE_DOMAIN(c) isascii(c)
41 #endif
42 #define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C))
44 struct compiled_regex {
45 bool match_words;
46 bool match_lines;
47 char eolbyte;
49 /* DFA compiled regexp. */
50 struct dfa dfa;
52 /* The Regex compiled patterns. */
53 struct patterns
55 /* Regex compiled regexp. */
56 struct re_pattern_buffer regexbuf;
57 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
58 Q@#%!# library interface in regex.c. */
59 } *patterns;
60 size_t pcount;
62 /* KWset compiled pattern. We compile a list of strings, at least one of
63 which is known to occur in any string matching the regexp. */
64 struct compiled_kwset ckwset;
66 /* Number of compiled fixed strings known to exactly match the regexp.
67 If kwsexec returns < kwset_exact_matches, then we don't need to
68 call the regexp matcher at all. */
69 int kwset_exact_matches;
72 /* Callback from dfa.c. */
73 void
74 dfaerror (const char *mesg)
76 error (exit_failure, 0, mesg);
79 /* If the DFA turns out to have some set of fixed strings one of
80 which must occur in the match, then we build a kwset matcher
81 to find those strings, and thus quickly filter out impossible
82 matches. */
83 static void
84 kwsmusts (struct compiled_regex *cregex,
85 bool match_icase, bool match_words, bool match_lines, char eolbyte)
87 struct dfamust const *dm;
88 const char *err;
90 if (cregex->dfa.musts)
92 kwsinit (&cregex->ckwset, match_icase, match_words, match_lines, eolbyte);
93 /* First, we compile in the substrings known to be exact
94 matches. The kwset matcher will return the index
95 of the matching string that it chooses. */
96 for (dm = cregex->dfa.musts; dm; dm = dm->next)
98 if (!dm->exact)
99 continue;
100 cregex->kwset_exact_matches++;
101 if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL)
102 error (exit_failure, 0, err);
104 /* Now, we compile the substrings that will require
105 the use of the regexp matcher. */
106 for (dm = cregex->dfa.musts; dm; dm = dm->next)
108 if (dm->exact)
109 continue;
110 if ((err = kwsincr (cregex->ckwset.kwset, dm->must, strlen (dm->must))) != NULL)
111 error (exit_failure, 0, err);
113 if ((err = kwsprep (cregex->ckwset.kwset)) != NULL)
114 error (exit_failure, 0, err);
118 static void *
119 Gcompile (const char *pattern, size_t pattern_size,
120 bool match_icase, bool match_words, bool match_lines, char eolbyte)
122 struct compiled_regex *cregex;
123 const char *err;
124 const char *sep;
125 size_t total = pattern_size;
126 const char *motif = pattern;
128 cregex = (struct compiled_regex *) xmalloc (sizeof (struct compiled_regex));
129 memset (cregex, '\0', sizeof (struct compiled_regex));
130 cregex->match_words = match_words;
131 cregex->match_lines = match_lines;
132 cregex->eolbyte = eolbyte;
133 cregex->patterns = NULL;
134 cregex->pcount = 0;
136 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
137 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
139 /* For GNU regex compiler we have to pass the patterns separately to detect
140 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
141 GNU regex should have raise a syntax error. The same for backref, where
142 the backref should have been local to each pattern. */
145 size_t len;
146 sep = memchr (motif, '\n', total);
147 if (sep)
149 len = sep - motif;
150 sep++;
151 total -= (len + 1);
153 else
155 len = total;
156 total = 0;
159 cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
160 memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
162 if ((err = re_compile_pattern (motif, len,
163 &(cregex->patterns[cregex->pcount].regexbuf))) != NULL)
164 error (exit_failure, 0, err);
165 cregex->pcount++;
167 motif = sep;
168 } while (sep && total != 0);
170 /* In the match_words and match_lines cases, we use a different pattern
171 for the DFA matcher that will quickly throw out cases that won't work.
172 Then if DFA succeeds we do some hairy stuff using the regex matcher
173 to decide whether the match should really count. */
174 if (match_words || match_lines)
176 /* In the whole-word case, we use the pattern:
177 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
178 In the whole-line case, we use the pattern:
179 ^\(userpattern\)$. */
181 static const char line_beg[] = "^\\(";
182 static const char line_end[] = "\\)$";
183 static const char word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
184 static const char word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
185 char *n = (char *) xmalloc (sizeof word_beg - 1 + pattern_size + sizeof word_end);
186 size_t i;
187 strcpy (n, match_lines ? line_beg : word_beg);
188 i = strlen (n);
189 memcpy (n + i, pattern, pattern_size);
190 i += pattern_size;
191 strcpy (n + i, match_lines ? line_end : word_end);
192 i += strlen (n + i);
193 pattern = n;
194 pattern_size = i;
197 dfacomp (pattern, pattern_size, &cregex->dfa, 1);
198 kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte);
200 return cregex;
203 static void *
204 compile (const char *pattern, size_t pattern_size,
205 bool match_icase, bool match_words, bool match_lines, char eolbyte,
206 reg_syntax_t syntax)
208 struct compiled_regex *cregex;
209 const char *err;
210 const char *sep;
211 size_t total = pattern_size;
212 const char *motif = pattern;
214 cregex = (struct compiled_regex *) xmalloc (sizeof (struct compiled_regex));
215 memset (cregex, '\0', sizeof (struct compiled_regex));
216 cregex->match_words = match_words;
217 cregex->match_lines = match_lines;
218 cregex->eolbyte = eolbyte;
219 cregex->patterns = NULL;
220 cregex->pcount = 0;
222 re_set_syntax (syntax);
223 dfasyntax (syntax, match_icase, eolbyte);
225 /* For GNU regex compiler we have to pass the patterns separately to detect
226 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
227 GNU regex should have raise a syntax error. The same for backref, where
228 the backref should have been local to each pattern. */
231 size_t len;
232 sep = memchr (motif, '\n', total);
233 if (sep)
235 len = sep - motif;
236 sep++;
237 total -= (len + 1);
239 else
241 len = total;
242 total = 0;
245 cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
246 memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
248 if ((err = re_compile_pattern (motif, len,
249 &(cregex->patterns[cregex->pcount].regexbuf))) != NULL)
250 error (exit_failure, 0, err);
251 cregex->pcount++;
253 motif = sep;
254 } while (sep && total != 0);
256 /* In the match_words and match_lines cases, we use a different pattern
257 for the DFA matcher that will quickly throw out cases that won't work.
258 Then if DFA succeeds we do some hairy stuff using the regex matcher
259 to decide whether the match should really count. */
260 if (match_words || match_lines)
262 /* In the whole-word case, we use the pattern:
263 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
264 In the whole-line case, we use the pattern:
265 ^(userpattern)$. */
267 static const char line_beg[] = "^(";
268 static const char line_end[] = ")$";
269 static const char word_beg[] = "(^|[^[:alnum:]_])(";
270 static const char word_end[] = ")([^[:alnum:]_]|$)";
271 char *n = (char *) xmalloc (sizeof word_beg - 1 + pattern_size + sizeof word_end);
272 size_t i;
273 strcpy (n, match_lines ? line_beg : word_beg);
274 i = strlen(n);
275 memcpy (n + i, pattern, pattern_size);
276 i += pattern_size;
277 strcpy (n + i, match_lines ? line_end : word_end);
278 i += strlen (n + i);
279 pattern = n;
280 pattern_size = i;
283 dfacomp (pattern, pattern_size, &cregex->dfa, 1);
284 kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte);
286 return cregex;
289 static void *
290 Ecompile (const char *pattern, size_t pattern_size,
291 bool match_icase, bool match_words, bool match_lines, char eolbyte)
293 return compile (pattern, pattern_size,
294 match_icase, match_words, match_lines, eolbyte,
295 RE_SYNTAX_POSIX_EGREP);
298 static void *
299 AWKcompile (const char *pattern, size_t pattern_size,
300 bool match_icase, bool match_words, bool match_lines, char eolbyte)
302 return compile (pattern, pattern_size,
303 match_icase, match_words, match_lines, eolbyte,
304 RE_SYNTAX_AWK);
307 static size_t
308 EGexecute (const void *compiled_pattern,
309 const char *buf, size_t buf_size,
310 size_t *match_size, bool exact)
312 struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
313 register const char *buflim, *beg, *end;
314 char eol = cregex->eolbyte;
315 int backref, start, len;
316 struct kwsmatch kwsm;
317 size_t i;
318 #ifdef MBS_SUPPORT
319 char *mb_properties = NULL;
320 #endif /* MBS_SUPPORT */
322 #ifdef MBS_SUPPORT
323 if (MB_CUR_MAX > 1 && cregex->ckwset.kwset)
324 mb_properties = check_multibyte_string (buf, buf_size);
325 #endif /* MBS_SUPPORT */
327 buflim = buf + buf_size;
329 for (beg = end = buf; end < buflim; beg = end)
331 if (!exact)
333 if (cregex->ckwset.kwset)
335 /* Find a possible match using the KWset matcher. */
336 size_t offset = kwsexec (cregex->ckwset.kwset, beg, buflim - beg, &kwsm);
337 if (offset == (size_t) -1)
339 #ifdef MBS_SUPPORT
340 if (MB_CUR_MAX > 1)
341 free (mb_properties);
342 #endif
343 return (size_t)-1;
345 beg += offset;
346 /* Narrow down to the line containing the candidate, and
347 run it through DFA. */
348 end = memchr (beg, eol, buflim - beg);
349 if (end != NULL)
350 end++;
351 else
352 end = buflim;
353 #ifdef MBS_SUPPORT
354 if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
355 continue;
356 #endif
357 while (beg > buf && beg[-1] != eol)
358 --beg;
359 if (kwsm.index < cregex->kwset_exact_matches)
360 goto success;
361 if (dfaexec (&cregex->dfa, beg, end - beg, &backref) == (size_t) -1)
362 continue;
364 else
366 /* No good fixed strings; start with DFA. */
367 size_t offset = dfaexec (&cregex->dfa, beg, buflim - beg, &backref);
368 if (offset == (size_t) -1)
369 break;
370 /* Narrow down to the line we've found. */
371 beg += offset;
372 end = memchr (beg, eol, buflim - beg);
373 if (end != NULL)
374 end++;
375 else
376 end = buflim;
377 while (beg > buf && beg[-1] != eol)
378 --beg;
380 /* Successful, no backreferences encountered! */
381 if (!backref)
382 goto success;
384 else
385 end = beg + buf_size;
387 /* If we've made it to this point, this means DFA has seen
388 a probable match, and we need to run it through Regex. */
389 for (i = 0; i < cregex->pcount; i++)
391 cregex->patterns[i].regexbuf.not_eol = 0;
392 if (0 <= (start = re_search (&(cregex->patterns[i].regexbuf), beg,
393 end - beg - 1, 0,
394 end - beg - 1, &(cregex->patterns[i].regs))))
396 len = cregex->patterns[i].regs.end[0] - start;
397 if (exact)
399 *match_size = len;
400 return start;
402 if ((!cregex->match_lines && !cregex->match_words)
403 || (cregex->match_lines && len == end - beg - 1))
404 goto success;
405 /* If -w, check if the match aligns with word boundaries.
406 We do this iteratively because:
407 (a) the line may contain more than one occurence of the
408 pattern, and
409 (b) Several alternatives in the pattern might be valid at a
410 given point, and we may need to consider a shorter one to
411 find a word boundary. */
412 if (cregex->match_words)
413 while (start >= 0)
415 if ((start == 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1]))
416 && (len == end - beg - 1
417 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len])))
418 goto success;
419 if (len > 0)
421 /* Try a shorter length anchored at the same place. */
422 --len;
423 cregex->patterns[i].regexbuf.not_eol = 1;
424 len = re_match (&(cregex->patterns[i].regexbuf), beg,
425 start + len, start,
426 &(cregex->patterns[i].regs));
428 if (len <= 0)
430 /* Try looking further on. */
431 if (start == end - beg - 1)
432 break;
433 ++start;
434 cregex->patterns[i].regexbuf.not_eol = 0;
435 start = re_search (&(cregex->patterns[i].regexbuf), beg,
436 end - beg - 1,
437 start, end - beg - 1 - start,
438 &(cregex->patterns[i].regs));
439 len = cregex->patterns[i].regs.end[0] - start;
443 } /* for Regex patterns. */
444 } /* for (beg = end ..) */
445 #ifdef MBS_SUPPORT
446 if (MB_CUR_MAX > 1 && mb_properties)
447 free (mb_properties);
448 #endif /* MBS_SUPPORT */
449 return (size_t) -1;
451 success:
452 #ifdef MBS_SUPPORT
453 if (MB_CUR_MAX > 1 && mb_properties)
454 free (mb_properties);
455 #endif /* MBS_SUPPORT */
456 *match_size = end - beg;
457 return beg - buf;
460 static void
461 EGfree (void *compiled_pattern)
463 struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
465 dfafree (&cregex->dfa);
466 free (cregex->patterns);
467 free (cregex->ckwset.trans);
468 free (cregex);
471 /* POSIX Basic Regular Expressions */
472 matcher_t matcher_grep =
474 Gcompile,
475 EGexecute,
476 EGfree
479 /* POSIX Extended Regular Expressions */
480 matcher_t matcher_egrep =
482 Ecompile,
483 EGexecute,
484 EGfree
487 /* AWK Regular Expressions */
488 matcher_t matcher_awk =
490 AWKcompile,
491 EGexecute,
492 EGfree