Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / grep / src / search.c
blobfbdafc5043caade6fd801dc46fab57b69c7fd5a8
1 /* $NetBSD: search.c,v 1.2 2003/01/26 23:55:53 wiz Exp $ */
3 /* search.c - searching subroutines using dfa, kwset and regex for grep.
4 Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA. */
21 /* Written August 1992 by Mike Haertel. */
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26 #include <sys/types.h>
27 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC && defined HAVE_WCTYPE
28 /* We can handle multibyte string. */
29 # define MBS_SUPPORT
30 # include <wchar.h>
31 # include <wctype.h>
32 #endif
34 #include "system.h"
35 #include "grep.h"
36 #include "regex.h"
37 #include "dfa.h"
38 #include "kwset.h"
39 #include "error.h"
40 #include "xalloc.h"
41 #ifdef HAVE_LIBPCRE
42 # include <pcre.h>
43 #endif
45 #define NCHAR (UCHAR_MAX + 1)
47 /* For -w, we also consider _ to be word constituent. */
48 #define WCHAR(C) (ISALNUM(C) || (C) == '_')
50 /* DFA compiled regexp. */
51 static struct dfa dfa;
53 /* The Regex compiled patterns. */
54 static struct patterns
56 /* Regex compiled regexp. */
57 struct re_pattern_buffer regexbuf;
58 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
59 Q@#%!# library interface in regex.c. */
60 } patterns0;
62 struct patterns *patterns;
63 size_t pcount;
65 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
66 a list of strings, at least one of which is known to occur in
67 any string matching the regexp. */
68 static kwset_t kwset;
70 /* Number of compiled fixed strings known to exactly match the regexp.
71 If kwsexec returns < kwset_exact_matches, then we don't need to
72 call the regexp matcher at all. */
73 static int kwset_exact_matches;
75 #if defined(MBS_SUPPORT)
76 static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
77 #endif
78 static void kwsinit PARAMS ((void));
79 static void kwsmusts PARAMS ((void));
80 static void Gcompile PARAMS ((char const *, size_t));
81 static void Ecompile PARAMS ((char const *, size_t));
82 static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
83 static void Fcompile PARAMS ((char const *, size_t));
84 static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
85 static void Pcompile PARAMS ((char const *, size_t ));
86 static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
88 void
89 dfaerror (char const *mesg)
91 error (2, 0, mesg);
94 static void
95 kwsinit (void)
97 static char trans[NCHAR];
98 int i;
100 if (match_icase)
101 for (i = 0; i < NCHAR; ++i)
102 trans[i] = TOLOWER (i);
104 if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
105 error (2, 0, _("memory exhausted"));
108 /* If the DFA turns out to have some set of fixed strings one of
109 which must occur in the match, then we build a kwset matcher
110 to find those strings, and thus quickly filter out impossible
111 matches. */
112 static void
113 kwsmusts (void)
115 struct dfamust const *dm;
116 char const *err;
118 if (dfa.musts)
120 kwsinit ();
121 /* First, we compile in the substrings known to be exact
122 matches. The kwset matcher will return the index
123 of the matching string that it chooses. */
124 for (dm = dfa.musts; dm; dm = dm->next)
126 if (!dm->exact)
127 continue;
128 ++kwset_exact_matches;
129 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
130 error (2, 0, err);
132 /* Now, we compile the substrings that will require
133 the use of the regexp matcher. */
134 for (dm = dfa.musts; dm; dm = dm->next)
136 if (dm->exact)
137 continue;
138 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
139 error (2, 0, err);
141 if ((err = kwsprep (kwset)) != 0)
142 error (2, 0, err);
146 #ifdef MBS_SUPPORT
147 /* This function allocate the array which correspond to "buf".
148 Then this check multibyte string and mark on the positions which
149 are not singlebyte character nor the first byte of a multibyte
150 character. Caller must free the array. */
151 static char*
152 check_multibyte_string(char const *buf, size_t size)
154 char *mb_properties = malloc(size);
155 mbstate_t cur_state;
156 int i;
157 memset(&cur_state, 0, sizeof(mbstate_t));
158 memset(mb_properties, 0, sizeof(char)*size);
159 for (i = 0; i < size ;)
161 size_t mbclen;
162 mbclen = mbrlen(buf + i, size - i, &cur_state);
164 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
166 /* An invalid sequence, or a truncated multibyte character.
167 We treat it as a singlebyte character. */
168 mbclen = 1;
170 mb_properties[i] = mbclen;
171 i += mbclen;
174 return mb_properties;
176 #endif
178 static void
179 Gcompile (char const *pattern, size_t size)
181 const char *err;
182 char const *sep;
183 size_t total = size;
184 char const *motif = pattern;
186 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
187 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
189 /* For GNU regex compiler we have to pass the patterns separately to detect
190 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
191 GNU regex should have raise a syntax error. The same for backref, where
192 the backref should have been local to each pattern. */
195 size_t len;
196 sep = memchr (motif, '\n', total);
197 if (sep)
199 len = sep - motif;
200 sep++;
201 total -= (len + 1);
203 else
205 len = total;
206 total = 0;
209 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
210 if (patterns == NULL)
211 error (2, errno, _("memory exhausted"));
213 patterns[pcount] = patterns0;
215 if ((err = re_compile_pattern (motif, len,
216 &(patterns[pcount].regexbuf))) != 0)
217 error (2, 0, err);
218 pcount++;
220 motif = sep;
221 } while (sep && total != 0);
223 /* In the match_words and match_lines cases, we use a different pattern
224 for the DFA matcher that will quickly throw out cases that won't work.
225 Then if DFA succeeds we do some hairy stuff using the regex matcher
226 to decide whether the match should really count. */
227 if (match_words || match_lines)
229 /* In the whole-word case, we use the pattern:
230 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
231 In the whole-line case, we use the pattern:
232 ^\(userpattern\)$. */
234 static char const line_beg[] = "^\\(";
235 static char const line_end[] = "\\)$";
236 static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
237 static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
238 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
239 size_t i;
240 strcpy (n, match_lines ? line_beg : word_beg);
241 i = strlen (n);
242 memcpy (n + i, pattern, size);
243 i += size;
244 strcpy (n + i, match_lines ? line_end : word_end);
245 i += strlen (n + i);
246 pattern = n;
247 size = i;
250 dfacomp (pattern, size, &dfa, 1);
251 kwsmusts ();
254 static void
255 Ecompile (char const *pattern, size_t size)
257 const char *err;
258 const char *sep;
259 size_t total = size;
260 char const *motif = pattern;
262 if (strcmp (matcher, "awk") == 0)
264 re_set_syntax (RE_SYNTAX_AWK);
265 dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
267 else
269 re_set_syntax (RE_SYNTAX_POSIX_EGREP);
270 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
273 /* For GNU regex compiler we have to pass the patterns separately to detect
274 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
275 GNU regex should have raise a syntax error. The same for backref, where
276 the backref should have been local to each pattern. */
279 size_t len;
280 sep = memchr (motif, '\n', total);
281 if (sep)
283 len = sep - motif;
284 sep++;
285 total -= (len + 1);
287 else
289 len = total;
290 total = 0;
293 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
294 if (patterns == NULL)
295 error (2, errno, _("memory exhausted"));
296 patterns[pcount] = patterns0;
298 if ((err = re_compile_pattern (motif, len,
299 &(patterns[pcount].regexbuf))) != 0)
300 error (2, 0, err);
301 pcount++;
303 motif = sep;
304 } while (sep && total != 0);
306 /* In the match_words and match_lines cases, we use a different pattern
307 for the DFA matcher that will quickly throw out cases that won't work.
308 Then if DFA succeeds we do some hairy stuff using the regex matcher
309 to decide whether the match should really count. */
310 if (match_words || match_lines)
312 /* In the whole-word case, we use the pattern:
313 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
314 In the whole-line case, we use the pattern:
315 ^(userpattern)$. */
317 static char const line_beg[] = "^(";
318 static char const line_end[] = ")$";
319 static char const word_beg[] = "(^|[^[:alnum:]_])(";
320 static char const word_end[] = ")([^[:alnum:]_]|$)";
321 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
322 size_t i;
323 strcpy (n, match_lines ? line_beg : word_beg);
324 i = strlen(n);
325 memcpy (n + i, pattern, size);
326 i += size;
327 strcpy (n + i, match_lines ? line_end : word_end);
328 i += strlen (n + i);
329 pattern = n;
330 size = i;
333 dfacomp (pattern, size, &dfa, 1);
334 kwsmusts ();
337 static size_t
338 EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
340 register char const *buflim, *beg, *end;
341 char eol = eolbyte;
342 int backref, start, len;
343 struct kwsmatch kwsm;
344 size_t i;
345 #ifdef MBS_SUPPORT
346 char *mb_properties = NULL;
347 #endif /* MBS_SUPPORT */
349 #ifdef MBS_SUPPORT
350 if (MB_CUR_MAX > 1 && kwset)
351 mb_properties = check_multibyte_string(buf, size);
352 #endif /* MBS_SUPPORT */
354 buflim = buf + size;
356 for (beg = end = buf; end < buflim; beg = end)
358 if (!exact)
360 if (kwset)
362 /* Find a possible match using the KWset matcher. */
363 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
364 if (offset == (size_t) -1)
365 goto failure;
366 beg += offset;
367 /* Narrow down to the line containing the candidate, and
368 run it through DFA. */
369 end = memchr(beg, eol, buflim - beg);
370 end++;
371 #ifdef MBS_SUPPORT
372 if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
373 continue;
374 #endif
375 while (beg > buf && beg[-1] != eol)
376 --beg;
377 if (kwsm.index < kwset_exact_matches)
378 goto success_in_beg_and_end;
379 if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
380 continue;
382 else
384 /* No good fixed strings; start with DFA. */
385 size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
386 if (offset == (size_t) -1)
387 break;
388 /* Narrow down to the line we've found. */
389 beg += offset;
390 end = memchr (beg, eol, buflim - beg);
391 end++;
392 while (beg > buf && beg[-1] != eol)
393 --beg;
395 /* Successful, no backreferences encountered! */
396 if (!backref)
397 goto success_in_beg_and_end;
399 else
400 end = beg + size;
402 /* If we've made it to this point, this means DFA has seen
403 a probable match, and we need to run it through Regex. */
404 for (i = 0; i < pcount; i++)
406 patterns[i].regexbuf.not_eol = 0;
407 if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
408 end - beg - 1, 0,
409 end - beg - 1, &(patterns[i].regs))))
411 len = patterns[i].regs.end[0] - start;
412 if (exact && !match_words)
413 goto success_in_start_and_len;
414 if ((!match_lines && !match_words)
415 || (match_lines && len == end - beg - 1))
416 goto success_in_beg_and_end;
417 /* If -w, check if the match aligns with word boundaries.
418 We do this iteratively because:
419 (a) the line may contain more than one occurence of the
420 pattern, and
421 (b) Several alternatives in the pattern might be valid at a
422 given point, and we may need to consider a shorter one to
423 find a word boundary. */
424 if (match_words)
425 while (start >= 0)
427 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
428 && (len == end - beg - 1
429 || !WCHAR ((unsigned char) beg[start + len])))
430 goto success_in_start_and_len;
431 if (len > 0)
433 /* Try a shorter length anchored at the same place. */
434 --len;
435 patterns[i].regexbuf.not_eol = 1;
436 len = re_match (&(patterns[i].regexbuf), beg,
437 start + len, start,
438 &(patterns[i].regs));
440 if (len <= 0)
442 /* Try looking further on. */
443 if (start == end - beg - 1)
444 break;
445 ++start;
446 patterns[i].regexbuf.not_eol = 0;
447 start = re_search (&(patterns[i].regexbuf), beg,
448 end - beg - 1,
449 start, end - beg - 1 - start,
450 &(patterns[i].regs));
451 len = patterns[i].regs.end[0] - start;
455 } /* for Regex patterns. */
456 } /* for (beg = end ..) */
458 failure:
459 #ifdef MBS_SUPPORT
460 if (MB_CUR_MAX > 1 && mb_properties)
461 free (mb_properties);
462 #endif /* MBS_SUPPORT */
463 return (size_t) -1;
465 success_in_beg_and_end:
466 len = end - beg;
467 start = beg - buf;
468 /* FALLTHROUGH */
470 success_in_start_and_len:
471 #ifdef MBS_SUPPORT
472 if (MB_CUR_MAX > 1 && mb_properties)
473 free (mb_properties);
474 #endif /* MBS_SUPPORT */
475 *match_size = len;
476 return start;
479 static void
480 Fcompile (char const *pattern, size_t size)
482 char const *beg, *lim, *err;
484 kwsinit ();
485 beg = pattern;
488 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
490 if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
491 error (2, 0, err);
492 if (lim < pattern + size)
493 ++lim;
494 beg = lim;
496 while (beg < pattern + size);
498 if ((err = kwsprep (kwset)) != 0)
499 error (2, 0, err);
502 static size_t
503 Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
505 register char const *beg, *try, *end;
506 register size_t len;
507 char eol = eolbyte;
508 struct kwsmatch kwsmatch;
509 #ifdef MBS_SUPPORT
510 char *mb_properties;
511 if (MB_CUR_MAX > 1)
512 mb_properties = check_multibyte_string (buf, size);
513 #endif /* MBS_SUPPORT */
515 for (beg = buf; beg <= buf + size; ++beg)
517 size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
518 if (offset == (size_t) -1)
519 goto failure;
520 #ifdef MBS_SUPPORT
521 if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
522 continue; /* It is a part of multibyte character. */
523 #endif /* MBS_SUPPORT */
524 beg += offset;
525 len = kwsmatch.size[0];
526 if (exact && !match_words)
527 goto success_in_beg_and_len;
528 if (match_lines)
530 if (beg > buf && beg[-1] != eol)
531 continue;
532 if (beg + len < buf + size && beg[len] != eol)
533 continue;
534 goto success;
536 else if (match_words)
538 while (offset >= 0)
540 if ((offset == 0 || !WCHAR ((unsigned char) beg[-1]))
541 && (len == end - beg - 1 || !WCHAR ((unsigned char) beg[len])))
543 if (!exact)
544 /* Returns the whole line now we know there's a word match. */
545 goto success;
546 else
547 /* Returns just this word match. */
548 goto success_in_beg_and_len;
550 if (len > 0)
552 /* Try a shorter length anchored at the same place. */
553 --len;
554 offset = kwsexec (kwset, beg, len, &kwsmatch);
555 if (offset == -1) {
556 break; /* Try a different anchor. */
558 beg += offset;
559 len = kwsmatch.size[0];
563 else
564 goto success;
567 failure:
568 #ifdef MBS_SUPPORT
569 if (MB_CUR_MAX > 1)
570 free (mb_properties);
571 #endif /* MBS_SUPPORT */
572 return -1;
574 success:
575 end = memchr (beg + len, eol, (buf + size) - (beg + len));
576 end++;
577 while (buf < beg && beg[-1] != eol)
578 --beg;
579 len = end - beg;
580 /* FALLTHROUGH */
582 success_in_beg_and_len:
583 *match_size = len;
584 #ifdef MBS_SUPPORT
585 if (MB_CUR_MAX > 1)
586 free (mb_properties);
587 #endif /* MBS_SUPPORT */
588 return beg - buf;
591 #if HAVE_LIBPCRE
592 /* Compiled internal form of a Perl regular expression. */
593 static pcre *cre;
595 /* Additional information about the pattern. */
596 static pcre_extra *extra;
597 #endif
599 static void
600 Pcompile (char const *pattern, size_t size)
602 #if !HAVE_LIBPCRE
603 error (2, 0, _("The -P option is not supported"));
604 #else
605 int e;
606 char const *ep;
607 char *re = xmalloc (4 * size + 7);
608 int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
609 char const *patlim = pattern + size;
610 char *n = re;
611 char const *p;
612 char const *pnul;
614 /* FIXME: Remove this restriction. */
615 if (eolbyte != '\n')
616 error (2, 0, _("The -P and -z options cannot be combined"));
618 *n = '\0';
619 if (match_lines)
620 strcpy (n, "^(");
621 if (match_words)
622 strcpy (n, "\\b(");
623 n += strlen (n);
625 /* The PCRE interface doesn't allow NUL bytes in the pattern, so
626 replace each NUL byte in the pattern with the four characters
627 "\000", removing a preceding backslash if there are an odd
628 number of backslashes before the NUL.
630 FIXME: This method does not work with some multibyte character
631 encodings, notably Shift-JIS, where a multibyte character can end
632 in a backslash byte. */
633 for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
635 memcpy (n, p, pnul - p);
636 n += pnul - p;
637 for (p = pnul; pattern < p && p[-1] == '\\'; p--)
638 continue;
639 n -= (pnul - p) & 1;
640 strcpy (n, "\\000");
641 n += 4;
644 memcpy (n, p, patlim - p);
645 n += patlim - p;
646 *n = '\0';
647 if (match_words)
648 strcpy (n, ")\\b");
649 if (match_lines)
650 strcpy (n, ")$");
652 cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
653 if (!cre)
654 error (2, 0, ep);
656 extra = pcre_study (cre, 0, &ep);
657 if (ep)
658 error (2, 0, ep);
660 free (re);
661 #endif
664 static size_t
665 Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
667 #if !HAVE_LIBPCRE
668 abort ();
669 return -1;
670 #else
671 /* This array must have at least two elements; everything after that
672 is just for performance improvement in pcre_exec. */
673 int sub[300];
675 int e = pcre_exec (cre, extra, buf, size, 0, 0,
676 sub, sizeof sub / sizeof *sub);
678 if (e <= 0)
680 switch (e)
682 case PCRE_ERROR_NOMATCH:
683 return -1;
685 case PCRE_ERROR_NOMEMORY:
686 error (2, 0, _("Memory exhausted"));
688 default:
689 abort ();
692 else
694 /* Narrow down to the line we've found. */
695 char const *beg = buf + sub[0];
696 char const *end = buf + sub[1];
697 char const *buflim = buf + size;
698 char eol = eolbyte;
699 if (!exact)
701 end = memchr (end, eol, buflim - end);
702 end++;
703 while (buf < beg && beg[-1] != eol)
704 --beg;
707 *match_size = end - beg;
708 return beg - buf;
710 #endif
713 struct matcher const matchers[] = {
714 { "default", Gcompile, EGexecute },
715 { "grep", Gcompile, EGexecute },
716 { "egrep", Ecompile, EGexecute },
717 { "awk", Ecompile, EGexecute },
718 { "fgrep", Fcompile, Fexecute },
719 { "perl", Pcompile, Pexecute },
720 { "", 0, 0 },