1 /* $NetBSD: search.c,v 1.2 2003/01/26 23:55:53 wiz Exp $ */
3 /* search.c - searching subroutines using dfa, kwset and regex for grep.
4 Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21 /* Written August 1992 by Mike Haertel. */
26 #include <sys/types.h>
27 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC && defined HAVE_WCTYPE
28 /* We can handle multibyte string. */
45 #define NCHAR (UCHAR_MAX + 1)
47 /* For -w, we also consider _ to be word constituent. */
48 #define WCHAR(C) (ISALNUM(C) || (C) == '_')
50 /* DFA compiled regexp. */
51 static struct dfa dfa
;
53 /* The Regex compiled patterns. */
54 static struct patterns
56 /* Regex compiled regexp. */
57 struct re_pattern_buffer regexbuf
;
58 struct re_registers regs
; /* This is here on account of a BRAIN-DEAD
59 Q@#%!# library interface in regex.c. */
62 struct patterns
*patterns
;
65 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
66 a list of strings, at least one of which is known to occur in
67 any string matching the regexp. */
70 /* Number of compiled fixed strings known to exactly match the regexp.
71 If kwsexec returns < kwset_exact_matches, then we don't need to
72 call the regexp matcher at all. */
73 static int kwset_exact_matches
;
75 #if defined(MBS_SUPPORT)
76 static char* check_multibyte_string
PARAMS ((char const *buf
, size_t size
));
78 static void kwsinit
PARAMS ((void));
79 static void kwsmusts
PARAMS ((void));
80 static void Gcompile
PARAMS ((char const *, size_t));
81 static void Ecompile
PARAMS ((char const *, size_t));
82 static size_t EGexecute
PARAMS ((char const *, size_t, size_t *, int ));
83 static void Fcompile
PARAMS ((char const *, size_t));
84 static size_t Fexecute
PARAMS ((char const *, size_t, size_t *, int));
85 static void Pcompile
PARAMS ((char const *, size_t ));
86 static size_t Pexecute
PARAMS ((char const *, size_t, size_t *, int));
89 dfaerror (char const *mesg
)
97 static char trans
[NCHAR
];
101 for (i
= 0; i
< NCHAR
; ++i
)
102 trans
[i
] = TOLOWER (i
);
104 if (!(kwset
= kwsalloc (match_icase
? trans
: (char *) 0)))
105 error (2, 0, _("memory exhausted"));
108 /* If the DFA turns out to have some set of fixed strings one of
109 which must occur in the match, then we build a kwset matcher
110 to find those strings, and thus quickly filter out impossible
115 struct dfamust
const *dm
;
121 /* First, we compile in the substrings known to be exact
122 matches. The kwset matcher will return the index
123 of the matching string that it chooses. */
124 for (dm
= dfa
.musts
; dm
; dm
= dm
->next
)
128 ++kwset_exact_matches
;
129 if ((err
= kwsincr (kwset
, dm
->must
, strlen (dm
->must
))) != 0)
132 /* Now, we compile the substrings that will require
133 the use of the regexp matcher. */
134 for (dm
= dfa
.musts
; dm
; dm
= dm
->next
)
138 if ((err
= kwsincr (kwset
, dm
->must
, strlen (dm
->must
))) != 0)
141 if ((err
= kwsprep (kwset
)) != 0)
147 /* This function allocate the array which correspond to "buf".
148 Then this check multibyte string and mark on the positions which
149 are not singlebyte character nor the first byte of a multibyte
150 character. Caller must free the array. */
152 check_multibyte_string(char const *buf
, size_t size
)
154 char *mb_properties
= malloc(size
);
157 memset(&cur_state
, 0, sizeof(mbstate_t));
158 memset(mb_properties
, 0, sizeof(char)*size
);
159 for (i
= 0; i
< size
;)
162 mbclen
= mbrlen(buf
+ i
, size
- i
, &cur_state
);
164 if (mbclen
== (size_t) -1 || mbclen
== (size_t) -2 || mbclen
== 0)
166 /* An invalid sequence, or a truncated multibyte character.
167 We treat it as a singlebyte character. */
170 mb_properties
[i
] = mbclen
;
174 return mb_properties
;
179 Gcompile (char const *pattern
, size_t size
)
184 char const *motif
= pattern
;
186 re_set_syntax (RE_SYNTAX_GREP
| RE_HAT_LISTS_NOT_NEWLINE
);
187 dfasyntax (RE_SYNTAX_GREP
| RE_HAT_LISTS_NOT_NEWLINE
, match_icase
, eolbyte
);
189 /* For GNU regex compiler we have to pass the patterns separately to detect
190 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
191 GNU regex should have raise a syntax error. The same for backref, where
192 the backref should have been local to each pattern. */
196 sep
= memchr (motif
, '\n', total
);
209 patterns
= realloc (patterns
, (pcount
+ 1) * sizeof (*patterns
));
210 if (patterns
== NULL
)
211 error (2, errno
, _("memory exhausted"));
213 patterns
[pcount
] = patterns0
;
215 if ((err
= re_compile_pattern (motif
, len
,
216 &(patterns
[pcount
].regexbuf
))) != 0)
221 } while (sep
&& total
!= 0);
223 /* In the match_words and match_lines cases, we use a different pattern
224 for the DFA matcher that will quickly throw out cases that won't work.
225 Then if DFA succeeds we do some hairy stuff using the regex matcher
226 to decide whether the match should really count. */
227 if (match_words
|| match_lines
)
229 /* In the whole-word case, we use the pattern:
230 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
231 In the whole-line case, we use the pattern:
232 ^\(userpattern\)$. */
234 static char const line_beg
[] = "^\\(";
235 static char const line_end
[] = "\\)$";
236 static char const word_beg
[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
237 static char const word_end
[] = "\\)\\([^[:alnum:]_]\\|$\\)";
238 char *n
= malloc (sizeof word_beg
- 1 + size
+ sizeof word_end
);
240 strcpy (n
, match_lines
? line_beg
: word_beg
);
242 memcpy (n
+ i
, pattern
, size
);
244 strcpy (n
+ i
, match_lines
? line_end
: word_end
);
250 dfacomp (pattern
, size
, &dfa
, 1);
255 Ecompile (char const *pattern
, size_t size
)
260 char const *motif
= pattern
;
262 if (strcmp (matcher
, "awk") == 0)
264 re_set_syntax (RE_SYNTAX_AWK
);
265 dfasyntax (RE_SYNTAX_AWK
, match_icase
, eolbyte
);
269 re_set_syntax (RE_SYNTAX_POSIX_EGREP
);
270 dfasyntax (RE_SYNTAX_POSIX_EGREP
, match_icase
, eolbyte
);
273 /* For GNU regex compiler we have to pass the patterns separately to detect
274 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
275 GNU regex should have raise a syntax error. The same for backref, where
276 the backref should have been local to each pattern. */
280 sep
= memchr (motif
, '\n', total
);
293 patterns
= realloc (patterns
, (pcount
+ 1) * sizeof (*patterns
));
294 if (patterns
== NULL
)
295 error (2, errno
, _("memory exhausted"));
296 patterns
[pcount
] = patterns0
;
298 if ((err
= re_compile_pattern (motif
, len
,
299 &(patterns
[pcount
].regexbuf
))) != 0)
304 } while (sep
&& total
!= 0);
306 /* In the match_words and match_lines cases, we use a different pattern
307 for the DFA matcher that will quickly throw out cases that won't work.
308 Then if DFA succeeds we do some hairy stuff using the regex matcher
309 to decide whether the match should really count. */
310 if (match_words
|| match_lines
)
312 /* In the whole-word case, we use the pattern:
313 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
314 In the whole-line case, we use the pattern:
317 static char const line_beg
[] = "^(";
318 static char const line_end
[] = ")$";
319 static char const word_beg
[] = "(^|[^[:alnum:]_])(";
320 static char const word_end
[] = ")([^[:alnum:]_]|$)";
321 char *n
= malloc (sizeof word_beg
- 1 + size
+ sizeof word_end
);
323 strcpy (n
, match_lines
? line_beg
: word_beg
);
325 memcpy (n
+ i
, pattern
, size
);
327 strcpy (n
+ i
, match_lines
? line_end
: word_end
);
333 dfacomp (pattern
, size
, &dfa
, 1);
338 EGexecute (char const *buf
, size_t size
, size_t *match_size
, int exact
)
340 register char const *buflim
, *beg
, *end
;
342 int backref
, start
, len
;
343 struct kwsmatch kwsm
;
346 char *mb_properties
= NULL
;
347 #endif /* MBS_SUPPORT */
350 if (MB_CUR_MAX
> 1 && kwset
)
351 mb_properties
= check_multibyte_string(buf
, size
);
352 #endif /* MBS_SUPPORT */
356 for (beg
= end
= buf
; end
< buflim
; beg
= end
)
362 /* Find a possible match using the KWset matcher. */
363 size_t offset
= kwsexec (kwset
, beg
, buflim
- beg
, &kwsm
);
364 if (offset
== (size_t) -1)
367 /* Narrow down to the line containing the candidate, and
368 run it through DFA. */
369 end
= memchr(beg
, eol
, buflim
- beg
);
372 if (MB_CUR_MAX
> 1 && mb_properties
[beg
- buf
] == 0)
375 while (beg
> buf
&& beg
[-1] != eol
)
377 if (kwsm
.index
< kwset_exact_matches
)
378 goto success_in_beg_and_end
;
379 if (dfaexec (&dfa
, beg
, end
- beg
, &backref
) == (size_t) -1)
384 /* No good fixed strings; start with DFA. */
385 size_t offset
= dfaexec (&dfa
, beg
, buflim
- beg
, &backref
);
386 if (offset
== (size_t) -1)
388 /* Narrow down to the line we've found. */
390 end
= memchr (beg
, eol
, buflim
- beg
);
392 while (beg
> buf
&& beg
[-1] != eol
)
395 /* Successful, no backreferences encountered! */
397 goto success_in_beg_and_end
;
402 /* If we've made it to this point, this means DFA has seen
403 a probable match, and we need to run it through Regex. */
404 for (i
= 0; i
< pcount
; i
++)
406 patterns
[i
].regexbuf
.not_eol
= 0;
407 if (0 <= (start
= re_search (&(patterns
[i
].regexbuf
), beg
,
409 end
- beg
- 1, &(patterns
[i
].regs
))))
411 len
= patterns
[i
].regs
.end
[0] - start
;
412 if (exact
&& !match_words
)
413 goto success_in_start_and_len
;
414 if ((!match_lines
&& !match_words
)
415 || (match_lines
&& len
== end
- beg
- 1))
416 goto success_in_beg_and_end
;
417 /* If -w, check if the match aligns with word boundaries.
418 We do this iteratively because:
419 (a) the line may contain more than one occurence of the
421 (b) Several alternatives in the pattern might be valid at a
422 given point, and we may need to consider a shorter one to
423 find a word boundary. */
427 if ((start
== 0 || !WCHAR ((unsigned char) beg
[start
- 1]))
428 && (len
== end
- beg
- 1
429 || !WCHAR ((unsigned char) beg
[start
+ len
])))
430 goto success_in_start_and_len
;
433 /* Try a shorter length anchored at the same place. */
435 patterns
[i
].regexbuf
.not_eol
= 1;
436 len
= re_match (&(patterns
[i
].regexbuf
), beg
,
438 &(patterns
[i
].regs
));
442 /* Try looking further on. */
443 if (start
== end
- beg
- 1)
446 patterns
[i
].regexbuf
.not_eol
= 0;
447 start
= re_search (&(patterns
[i
].regexbuf
), beg
,
449 start
, end
- beg
- 1 - start
,
450 &(patterns
[i
].regs
));
451 len
= patterns
[i
].regs
.end
[0] - start
;
455 } /* for Regex patterns. */
456 } /* for (beg = end ..) */
460 if (MB_CUR_MAX
> 1 && mb_properties
)
461 free (mb_properties
);
462 #endif /* MBS_SUPPORT */
465 success_in_beg_and_end
:
470 success_in_start_and_len
:
472 if (MB_CUR_MAX
> 1 && mb_properties
)
473 free (mb_properties
);
474 #endif /* MBS_SUPPORT */
480 Fcompile (char const *pattern
, size_t size
)
482 char const *beg
, *lim
, *err
;
488 for (lim
= beg
; lim
< pattern
+ size
&& *lim
!= '\n'; ++lim
)
490 if ((err
= kwsincr (kwset
, beg
, lim
- beg
)) != 0)
492 if (lim
< pattern
+ size
)
496 while (beg
< pattern
+ size
);
498 if ((err
= kwsprep (kwset
)) != 0)
503 Fexecute (char const *buf
, size_t size
, size_t *match_size
, int exact
)
505 register char const *beg
, *try, *end
;
508 struct kwsmatch kwsmatch
;
512 mb_properties
= check_multibyte_string (buf
, size
);
513 #endif /* MBS_SUPPORT */
515 for (beg
= buf
; beg
<= buf
+ size
; ++beg
)
517 size_t offset
= kwsexec (kwset
, beg
, buf
+ size
- beg
, &kwsmatch
);
518 if (offset
== (size_t) -1)
521 if (MB_CUR_MAX
> 1 && mb_properties
[offset
+beg
-buf
] == 0)
522 continue; /* It is a part of multibyte character. */
523 #endif /* MBS_SUPPORT */
525 len
= kwsmatch
.size
[0];
526 if (exact
&& !match_words
)
527 goto success_in_beg_and_len
;
530 if (beg
> buf
&& beg
[-1] != eol
)
532 if (beg
+ len
< buf
+ size
&& beg
[len
] != eol
)
536 else if (match_words
)
540 if ((offset
== 0 || !WCHAR ((unsigned char) beg
[-1]))
541 && (len
== end
- beg
- 1 || !WCHAR ((unsigned char) beg
[len
])))
544 /* Returns the whole line now we know there's a word match. */
547 /* Returns just this word match. */
548 goto success_in_beg_and_len
;
552 /* Try a shorter length anchored at the same place. */
554 offset
= kwsexec (kwset
, beg
, len
, &kwsmatch
);
556 break; /* Try a different anchor. */
559 len
= kwsmatch
.size
[0];
570 free (mb_properties
);
571 #endif /* MBS_SUPPORT */
575 end
= memchr (beg
+ len
, eol
, (buf
+ size
) - (beg
+ len
));
577 while (buf
< beg
&& beg
[-1] != eol
)
582 success_in_beg_and_len
:
586 free (mb_properties
);
587 #endif /* MBS_SUPPORT */
592 /* Compiled internal form of a Perl regular expression. */
595 /* Additional information about the pattern. */
596 static pcre_extra
*extra
;
600 Pcompile (char const *pattern
, size_t size
)
603 error (2, 0, _("The -P option is not supported"));
607 char *re
= xmalloc (4 * size
+ 7);
608 int flags
= PCRE_MULTILINE
| (match_icase
? PCRE_CASELESS
: 0);
609 char const *patlim
= pattern
+ size
;
614 /* FIXME: Remove this restriction. */
616 error (2, 0, _("The -P and -z options cannot be combined"));
625 /* The PCRE interface doesn't allow NUL bytes in the pattern, so
626 replace each NUL byte in the pattern with the four characters
627 "\000", removing a preceding backslash if there are an odd
628 number of backslashes before the NUL.
630 FIXME: This method does not work with some multibyte character
631 encodings, notably Shift-JIS, where a multibyte character can end
632 in a backslash byte. */
633 for (p
= pattern
; (pnul
= memchr (p
, '\0', patlim
- p
)); p
= pnul
+ 1)
635 memcpy (n
, p
, pnul
- p
);
637 for (p
= pnul
; pattern
< p
&& p
[-1] == '\\'; p
--)
644 memcpy (n
, p
, patlim
- p
);
652 cre
= pcre_compile (re
, flags
, &ep
, &e
, pcre_maketables ());
656 extra
= pcre_study (cre
, 0, &ep
);
665 Pexecute (char const *buf
, size_t size
, size_t *match_size
, int exact
)
671 /* This array must have at least two elements; everything after that
672 is just for performance improvement in pcre_exec. */
675 int e
= pcre_exec (cre
, extra
, buf
, size
, 0, 0,
676 sub
, sizeof sub
/ sizeof *sub
);
682 case PCRE_ERROR_NOMATCH
:
685 case PCRE_ERROR_NOMEMORY
:
686 error (2, 0, _("Memory exhausted"));
694 /* Narrow down to the line we've found. */
695 char const *beg
= buf
+ sub
[0];
696 char const *end
= buf
+ sub
[1];
697 char const *buflim
= buf
+ size
;
701 end
= memchr (end
, eol
, buflim
- end
);
703 while (buf
< beg
&& beg
[-1] != eol
)
707 *match_size
= end
- beg
;
713 struct matcher
const matchers
[] = {
714 { "default", Gcompile
, EGexecute
},
715 { "grep", Gcompile
, EGexecute
},
716 { "egrep", Ecompile
, EGexecute
},
717 { "awk", Ecompile
, EGexecute
},
718 { "fgrep", Fcompile
, Fexecute
},
719 { "perl", Pcompile
, Pexecute
},