Drop main() prototype. Syncs with NetBSD-8
[minix.git] / minix / usr.bin / grep / util.c
blobdbeb21a06fb62d3a5a9d9a63652c93de45d74288
1 /* $OpenBSD: util.c,v 1.48 2014/05/20 01:25:23 guenther Exp $ */
3 /*-
4 * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5 * All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
29 #include <sys/types.h>
30 #include <sys/stat.h>
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <zlib.h>
43 #include "grep.h"
46 * Process a file line by line...
49 static int linesqueued;
50 static int procline(str_t *l, int);
51 static int grep_search(fastgrep_t *, unsigned char *, size_t,
52 regmatch_t *pmatch);
53 #ifndef SMALL
54 static int grep_cmp(const unsigned char *, const unsigned char *, size_t);
55 static void grep_revstr(unsigned char *, int);
56 #endif
58 int
59 grep_tree(char **argv)
61 FTS *fts;
62 FTSENT *p;
63 int c, fts_flags;
65 c = 0;
67 fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
69 if (!(fts = fts_open(argv, fts_flags, NULL)))
70 err(2, NULL);
71 while ((p = fts_read(fts)) != NULL) {
72 switch (p->fts_info) {
73 case FTS_DNR:
74 break;
75 case FTS_ERR:
76 file_err = 1;
77 if(!sflag) {
78 errno = p->fts_errno;
79 warn("%s", p->fts_path);
81 break;
82 case FTS_DP:
83 break;
84 default:
85 c += procfile(p->fts_path);
86 break;
89 if (errno)
90 err(2, "fts_read");
92 return c;
95 int
96 procfile(const char *fn)
98 str_t ln;
99 file_t *f;
100 int c, t, z, nottext;
102 if (fn == NULL) {
103 fn = "(standard input)";
104 f = grep_fdopen(STDIN_FILENO, "r");
105 } else {
106 f = grep_open(fn, "r");
108 if (f == NULL) {
109 file_err = 1;
110 if (!sflag)
111 warn("%s", fn);
112 return 0;
115 nottext = grep_bin_file(f);
116 if (nottext && binbehave == BIN_FILE_SKIP) {
117 grep_close(f);
118 return 0;
121 ln.file = fn;
122 ln.line_no = 0;
123 ln.len = 0;
124 linesqueued = 0;
125 tail = 0;
126 ln.off = -1;
128 if (Bflag > 0)
129 initqueue();
130 for (c = 0; c == 0 || !(lflag || qflag); ) {
131 ln.off += ln.len + 1;
132 if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
133 break;
134 if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
135 --ln.len;
136 ln.line_no++;
138 z = tail;
140 if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
141 enqueue(&ln);
142 linesqueued++;
144 c += t;
146 if (Bflag > 0)
147 clearqueue();
148 grep_close(f);
150 if (cflag) {
151 if (!hflag)
152 printf("%s:", ln.file);
153 printf("%u\n", c);
155 if (lflag && c != 0)
156 printf("%s\n", fn);
157 if (Lflag && c == 0)
158 printf("%s\n", fn);
159 if (c && !cflag && !lflag && !Lflag &&
160 binbehave == BIN_FILE_BIN && nottext && !qflag)
161 printf("Binary file %s matches\n", fn);
163 return c;
/*
 * Process an individual line in a file. Return non-zero if it matches.
 */
/*
 * True when x is a word character: alphanumeric or '_'.  The argument is
 * parenthesized at every use so that an expression argument is cast as a
 * whole; note that x is still evaluated twice.
 */
#define isword(x)	(isalnum((unsigned char)(x)) || (x) == '_')
173 static int
174 procline(str_t *l, int nottext)
176 regmatch_t pmatch;
177 int c, i, r;
178 regoff_t offset;
180 /* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
181 * into regoff_t */
182 if (l->len > SSIZE_MAX) {
183 errx(2, "Line is too big to process");
186 c = 0;
187 i = 0;
188 if (matchall) {
189 c = 1;
190 goto print;
193 for (i = 0; i < patterns; i++) {
194 offset = 0;
195 redo:
196 if (fg_pattern[i].pattern) {
197 r = grep_search(&fg_pattern[i],
198 (unsigned char *)l->dat + offset, l->len - offset,
199 &pmatch);
200 pmatch.rm_so += offset;
201 pmatch.rm_eo += offset;
202 } else {
203 pmatch.rm_so = offset;
204 pmatch.rm_eo = l->len;
205 r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
207 if (r == 0 && xflag) {
208 if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
209 r = REG_NOMATCH;
211 if (r == 0) {
212 c = 1;
213 if (oflag && pmatch.rm_so != pmatch.rm_eo)
214 goto print;
215 break;
218 if (oflag)
219 return c;
220 print:
221 if (vflag)
222 c = !c;
224 if (c && binbehave == BIN_FILE_BIN && nottext)
225 return c; /* Binary file */
227 if ((tail > 0 || c) && !cflag && !qflag) {
228 if (c) {
229 if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
230 (Aflag || Bflag))
231 printf("--\n");
232 first = 1;
233 tail = Aflag;
234 if (Bflag > 0)
235 printqueue();
236 linesqueued = 0;
237 printline(l, ':', oflag ? &pmatch : NULL);
238 } else {
239 printline(l, '-', oflag ? &pmatch : NULL);
240 tail--;
243 if (oflag && !matchall) {
244 offset = pmatch.rm_eo;
245 goto redo;
247 return c;
250 #ifndef SMALL
251 void
252 fgrepcomp(fastgrep_t *fg, const unsigned char *pat)
254 int i;
256 /* Initialize. */
257 fg->patternLen = strlen((const char *)pat);
258 fg->bol = 0;
259 fg->eol = 0;
260 fg->wmatch = wflag;
261 fg->reversedSearch = 0;
264 * Make a copy and upper case it for later if in -i mode,
265 * else just copy the pointer.
267 if (iflag) {
268 fg->pattern = grep_malloc(fg->patternLen + 1);
269 for (i = 0; i < fg->patternLen; i++)
270 fg->pattern[i] = toupper(pat[i]);
271 fg->pattern[fg->patternLen] = '\0';
272 } else
273 fg->pattern = __UNCONST(pat); /* really const */
275 /* Preprocess pattern. */
276 for (i = 0; i <= UCHAR_MAX; i++)
277 fg->qsBc[i] = fg->patternLen;
278 for (i = 1; i < fg->patternLen; i++) {
279 fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
281 * If case is ignored, make the jump apply to both upper and
282 * lower cased characters. As the pattern is stored in upper
283 * case, apply the same to the lower case equivalents.
285 if (iflag)
286 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
289 #endif
292 * Returns: -1 on failure, 0 on success
295 fastcomp(fastgrep_t *fg, const char *pat)
297 #ifdef SMALL
298 return -1;
299 #else
300 int i;
301 int bol = 0;
302 int eol = 0;
303 int shiftPatternLen;
304 int hasDot = 0;
305 int firstHalfDot = -1;
306 int firstLastHalfDot = -1;
307 int lastHalfDot = 0;
309 /* Initialize. */
310 fg->patternLen = strlen(pat);
311 fg->bol = 0;
312 fg->eol = 0;
313 fg->wmatch = 0;
314 fg->reversedSearch = 0;
316 /* Remove end-of-line character ('$'). */
317 if (fg->patternLen > 0 && pat[fg->patternLen - 1] == '$') {
318 eol++;
319 fg->eol = 1;
320 fg->patternLen--;
323 /* Remove beginning-of-line character ('^'). */
324 if (pat[0] == '^') {
325 bol++;
326 fg->bol = 1;
327 fg->patternLen--;
330 /* Remove enclosing [[:<:]] and [[:>:]] (word match). */
331 if (wflag) {
332 /* basic re's use \( \), extended re's ( ) */
333 int extra = Eflag ? 1 : 2;
334 fg->patternLen -= 14 + 2 * extra;
335 fg->wmatch = 7 + extra;
336 } else if (fg->patternLen >= 14 &&
337 strncmp(pat + fg->bol, "[[:<:]]", 7) == 0 &&
338 strncmp(pat + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
339 fg->patternLen -= 14;
340 fg->wmatch = 7;
344 * Copy pattern minus '^' and '$' characters as well as word
345 * match character classes at the beginning and ending of the
346 * string respectively.
348 fg->pattern = grep_malloc(fg->patternLen + 1);
349 memcpy(fg->pattern, pat + bol + fg->wmatch, fg->patternLen);
350 fg->pattern[fg->patternLen] = '\0';
352 /* Look for ways to cheat...er...avoid the full regex engine. */
353 for (i = 0; i < fg->patternLen; i++)
355 switch (fg->pattern[i]) {
356 case '.':
357 hasDot = i;
358 if (i < fg->patternLen / 2) {
359 if (firstHalfDot < 0)
360 /* Closest dot to the beginning */
361 firstHalfDot = i;
362 } else {
363 /* Closest dot to the end of the pattern. */
364 lastHalfDot = i;
365 if (firstLastHalfDot < 0)
366 firstLastHalfDot = i;
368 break;
369 case '(': case ')':
370 case '{': case '}':
371 /* Special in BRE if preceded by '\\' */
372 case '?':
373 case '+':
374 case '|':
375 /* Not special in BRE. */
376 if (!Eflag)
377 goto nonspecial;
378 case '\\':
379 case '*':
380 case '[': case ']':
381 /* Free memory and let others know this is empty. */
382 free(fg->pattern);
383 fg->pattern = NULL;
384 return (-1);
385 default:
386 nonspecial:
387 if (iflag)
388 fg->pattern[i] = toupper(fg->pattern[i]);
389 break;
394 * Determine if a reverse search would be faster based on the placement
395 * of the dots.
397 if ((!(lflag || cflag)) && ((!(bol || eol)) &&
398 ((lastHalfDot) && ((firstHalfDot < 0) ||
399 ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
400 fg->reversedSearch = 1;
401 hasDot = fg->patternLen - (firstHalfDot < 0 ?
402 firstLastHalfDot : firstHalfDot) - 1;
403 grep_revstr(fg->pattern, fg->patternLen);
407 * Normal Quick Search would require a shift based on the position the
408 * next character after the comparison is within the pattern. With
409 * wildcards, the position of the last dot effects the maximum shift
410 * distance.
411 * The closer to the end the wild card is the slower the search. A
412 * reverse version of this algorithm would be useful for wildcards near
413 * the end of the string.
415 * Examples:
416 * Pattern Max shift
417 * ------- ---------
418 * this 5
419 * .his 4
420 * t.is 3
421 * th.s 2
422 * thi. 1
425 /* Adjust the shift based on location of the last dot ('.'). */
426 shiftPatternLen = fg->patternLen - hasDot;
428 /* Preprocess pattern. */
429 for (i = 0; i <= UCHAR_MAX; i++)
430 fg->qsBc[i] = shiftPatternLen;
431 for (i = hasDot + 1; i < fg->patternLen; i++) {
432 fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
434 * If case is ignored, make the jump apply to both upper and
435 * lower cased characters. As the pattern is stored in upper
436 * case, apply the same to the lower case equivalents.
438 if (iflag)
439 fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
443 * Put pattern back to normal after pre-processing to allow for easy
444 * comparisons later.
446 if (fg->reversedSearch)
447 grep_revstr(fg->pattern, fg->patternLen);
449 return (0);
450 #endif
/*
 * Word boundaries using regular expressions are defined as the point
 * of transition from a non-word char to a word char, or vice versa.
 * This means that grep -w +a and grep -w a+ never match anything,
 * because they lack a starting or ending transition, but grep -w a+b
 * does match a line containing a+b.
 *
 * wmatch(d, l, s, e) is true when the non-empty range [s, e) of the
 * l-byte buffer d begins and ends with a word character and is not
 * adjacent to a word character on either side.  Every parameter is
 * parenthesized so expression arguments expand safely; arguments are
 * evaluated more than once.
 */
#define wmatch(d, l, s, e)	\
	(((s) == 0 || !isword((d)[(s) - 1])) && \
	 ((e) == (l) || !isword((d)[(e)])) && \
	 (e) > (s) && isword((d)[(s)]) && isword((d)[(e) - 1]))
464 static int
465 grep_search(fastgrep_t *fg, unsigned char *data, size_t dataLen,
466 regmatch_t *pmatch)
468 #ifdef SMALL
469 return 0;
470 #else
471 regoff_t j;
472 int rtrnVal = REG_NOMATCH;
474 pmatch->rm_so = -1;
475 pmatch->rm_eo = -1;
477 /* No point in going farther if we do not have enough data. */
478 if (dataLen < (size_t)fg->patternLen)
479 return (rtrnVal);
481 /* Only try once at the beginning or ending of the line. */
482 if (fg->bol || fg->eol) {
483 /* Simple text comparison. */
484 /* Verify data is >= pattern length before searching on it. */
485 if (dataLen >= (size_t)fg->patternLen) {
486 /* Determine where in data to start search at. */
487 if (fg->eol)
488 j = dataLen - fg->patternLen;
489 else
490 j = 0;
491 if (!((fg->bol && fg->eol) &&
492 (dataLen != (size_t)fg->patternLen)))
493 if (grep_cmp(fg->pattern, data + j,
494 fg->patternLen) == -1) {
495 pmatch->rm_so = j;
496 pmatch->rm_eo = j + fg->patternLen;
497 if (!fg->wmatch || wmatch(data, dataLen,
498 pmatch->rm_so, pmatch->rm_eo))
499 rtrnVal = 0;
502 } else if (fg->reversedSearch) {
503 /* Quick Search algorithm. */
504 j = dataLen;
505 do {
506 if (grep_cmp(fg->pattern, data + j - fg->patternLen,
507 fg->patternLen) == -1) {
508 pmatch->rm_so = j - fg->patternLen;
509 pmatch->rm_eo = j;
510 if (!fg->wmatch || wmatch(data, dataLen,
511 pmatch->rm_so, pmatch->rm_eo)) {
512 rtrnVal = 0;
513 break;
516 /* Shift if within bounds, otherwise, we are done. */
517 if (j == fg->patternLen)
518 break;
519 j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
520 } while (j >= fg->patternLen);
521 } else {
522 /* Quick Search algorithm. */
523 j = 0;
524 do {
525 if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
526 pmatch->rm_so = j;
527 pmatch->rm_eo = j + fg->patternLen;
528 if (fg->patternLen == 0 || !fg->wmatch ||
529 wmatch(data, dataLen, pmatch->rm_so,
530 pmatch->rm_eo)) {
531 rtrnVal = 0;
532 break;
536 /* Shift if within bounds, otherwise, we are done. */
537 if (j + fg->patternLen == dataLen)
538 break;
539 else
540 j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
541 } while (j <= (dataLen - fg->patternLen));
544 return (rtrnVal);
545 #endif
/*
 * malloc() wrapper that never returns NULL: on allocation failure the
 * program exits with status 2 and a "malloc" diagnostic.
 */
void *
grep_malloc(size_t size)
{
	void *mem = malloc(size);

	if (mem == NULL)
		err(2, "malloc");
	return mem;
}
/*
 * calloc() wrapper that never returns NULL: on allocation failure the
 * program exits with status 2 and a "calloc" diagnostic.
 */
void *
grep_calloc(size_t nmemb, size_t size)
{
	void *mem = calloc(nmemb, size);

	if (mem == NULL)
		err(2, "calloc");
	return mem;
}
/*
 * realloc() wrapper that never returns NULL: on allocation failure the
 * program exits with status 2 and a "realloc" diagnostic.
 *
 * A request for 0 bytes is rounded up to 1.  realloc(ptr, 0) may free ptr
 * and return NULL, which would otherwise be reported as a fatal
 * out-of-memory error and leave the caller without a valid pointer.
 */
void *
grep_realloc(void *ptr, size_t size)
{
	void *grown;

	if (size == 0)
		size = 1;

	grown = realloc(ptr, size);
	if (grown == NULL)
		err(2, "realloc");
	return grown;
}
577 #ifndef SMALL
579 * Returns: i >= 0 on failure (position that it failed)
580 * -1 on success
582 static int
583 grep_cmp(const unsigned char *pat, const unsigned char *data, size_t len)
585 size_t i;
587 for (i = 0; i < len; i++) {
588 if (((pat[i] == data[i]) || (!Fflag && pat[i] == '.'))
589 || (iflag && pat[i] == toupper(data[i])))
590 continue;
591 return (i);
594 return (-1);
/*
 * Reverse the first len bytes of str in place.  A len of 0 or less
 * leaves str untouched.
 */
static void
grep_revstr(unsigned char *str, int len)
{
	int lo, hi;
	unsigned char tmp;	/* same type as the data: no sign conversion */

	for (lo = 0, hi = len - 1; lo < hi; lo++, hi--) {
		tmp = str[lo];
		str[lo] = str[hi];
		str[hi] = tmp;
	}
}
609 #endif
611 void
612 printline(str_t *line, int sep, regmatch_t *pmatch)
614 int n;
616 n = 0;
617 if (!hflag) {
618 fputs(line->file, stdout);
619 ++n;
621 if (nflag) {
622 if (n)
623 putchar(sep);
624 printf("%d", line->line_no);
625 ++n;
627 if (bflag) {
628 if (n)
629 putchar(sep);
630 printf("%lld", (long long)line->off);
631 ++n;
633 if (n)
634 putchar(sep);
635 if (pmatch)
636 fwrite(line->dat + pmatch->rm_so,
637 pmatch->rm_eo - pmatch->rm_so, 1, stdout);
638 else
639 fwrite(line->dat, line->len, 1, stdout);
640 putchar('\n');