(main): Stat all non-`-' input file files (and fail if a
[coreutils.git] / src / cut.c
blob8b2ec239a4bfaad9f068ee4466b417dcc2cc95e5
1 /* cut - remove parts of lines of files
2 Copyright (C) 1984, 1997, 1998 by David M. Ihnat
4 This program is a total rewrite of the Bell Laboratories Unix(Tm)
5 command of the same name, as of System V. It contains no proprietary
6 code, and therefore may be used without violation of any proprietary
7 agreements whatsoever. However, you will notice that the program is
8 copyrighted by me. This is to assure the program does *not* fall
9 into the public domain. Thus, I may specify just what I am now:
10 This program may be freely copied and distributed, provided this notice
11 remains; it may not be sold for profit without express written consent of
12 the author.
13 Please note that I recreated the behavior of the Unix(Tm) 'cut' command
14 as faithfully as possible; however, I haven't run a full set of regression
15 tests. Thus, the user of this program accepts full responsibility for any
16 effects or loss; in particular, the author is not responsible for any losses,
17 explicit or incidental, that may be incurred through use of this program.
19 I ask that any bugs (and, if possible, fixes) be reported to me when
20 possible. -David Ihnat (312) 784-4544 ignatz@homebru.chi.il.us
22 POSIX changes, bug fixes, long-named options, and cleanup
23 by David MacKenzie <djm@gnu.ai.mit.edu>.
25 Rewrite cut_fields and cut_bytes -- Jim Meyering (meyering@comco.com).
27 Options:
28 --bytes=byte-list
29 -b byte-list Print only the bytes in positions listed
30 in BYTE-LIST.
31 Tabs and backspaces are treated like any
32 other character; they take up 1 byte.
34 --characters=character-list
35 -c character-list Print only characters in positions listed
36 in CHARACTER-LIST.
37 The same as -b for now, but
38 internationalization will change that.
39 Tabs and backspaces are treated like any
40 other character; they take up 1 character.
42 --fields=field-list
43 -f field-list Print only the fields listed in FIELD-LIST.
44 Fields are separated by a TAB by default.
46 --delimiter=delim
47 -d delim For -f, fields are separated by the first
48 character in DELIM instead of TAB.
50 -n Do not split multibyte chars (no-op for now).
52 --only-delimited
53 -s For -f, do not print lines that do not contain
54 the field separator character.
56 The BYTE-LIST, CHARACTER-LIST, and FIELD-LIST are one or more numbers
57 or ranges separated by commas. The first byte, character, and field
58 are numbered 1.
60 A FILE of `-' means standard input. */
62 #include <config.h>
64 #include <stdio.h>
65 #include <assert.h>
66 #include <getopt.h>
67 #include <sys/types.h>
68 #include "system.h"
69 #include "error.h"
/* Report the diagnostic S through error () with no errno text and without
   exiting, then call usage (2).  S is handed to a "%s" format instead of
   being used as the format itself, so a `%' occurring in a (possibly
   translated) message is printed literally rather than interpreted as a
   conversion.  */
#define FATAL_ERROR(s)                                                  \
  do                                                                    \
    {                                                                   \
      error (0, 0, "%s", (s));                                          \
      usage (2);                                                        \
    }                                                                   \
  while (0)
/* Append the pair LOW, HIGH to the list RP of range pairs, doubling the
   allocation with xrealloc when the list is full.  This macro relies on
   two variables that must be in scope where it is expanded: N_RP, the
   number of pairs stored so far (incremented here), and N_RP_ALLOCATED,
   the current capacity (updated here when the list grows).  RP must be
   a modifiable lvalue because growing the list assigns to it; it is
   parenthesized at every use so any lvalue expression may be passed.  */

#define ADD_RANGE_PAIR(rp, low, high)                                   \
  do                                                                    \
    {                                                                   \
      if (n_rp >= n_rp_allocated)                                       \
        {                                                               \
          n_rp_allocated *= 2;                                          \
          (rp) = (struct range_pair *) xrealloc ((char *) (rp),         \
                                 n_rp_allocated * sizeof (*(rp)));      \
        }                                                               \
      (rp)[n_rp].lo = (low);                                            \
      (rp)[n_rp].hi = (high);                                           \
      ++n_rp;                                                           \
    }                                                                   \
  while (0)
/* One closed range of byte or field indices: every index from LO
   through HI, both ends included.  */
struct range_pair
  {
    unsigned int lo;    /* First index in the range.  */
    unsigned int hi;    /* Last index in the range.  */
  };
/* Holding area for the first field of a line.  It is needed to get the
   -s option (or its absence) right when the field list includes (does
   not include) field 1: in those cases the whole first field has to be
   read before any of it may be output, because only then is it known
   whether a delimiter or a newline follows it.  In the remaining cases
   cut_fields works without this buffer.  */
static char *field_1_buffer;

/* Size in bytes of the storage FIELD_1_BUFFER points to.  */
static int field_1_bufsize;

/* Largest index that ends a closed range or is given as a single
   number; the start of an open-ended `N-' range does not count.  With
   either `2-5,9-' or `2-3,5,9-' this is 5.  */
static unsigned int max_range_endpoint;

/* Index at which a range running to end of line begins, or zero when
   no such range was given.  */
static unsigned int eol_range_start;

/* Selection table: bytes in byte mode, DELIM-separated fields in field
   mode.  Numbering starts at 1, so element zero is never consulted.
   Index K is selected when
     (K <= MAX_RANGE_ENDPOINT and PRINTABLE_FIELD[K])
     || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START).  */
static int *printable_field;
/* Which kind of list selects the output.  */
enum operating_mode
  {
    /* No list option has been processed yet.  */
    undefined_mode,

    /* Output characters that are in the given bytes.  */
    byte_mode,

    /* Output the given delimiter-separated fields.  */
    field_mode
  };
146 /* The name this program was run with. */
147 char *program_name;
149 static enum operating_mode operating_mode;
151 /* If nonzero do not output lines containing no delimeter characters.
152 Otherwise, all such lines are printed. This option is valid only
153 with field mode. */
154 static int suppress_non_delimited;
156 /* The delimeter character for field mode. */
157 static int delim;
159 /* Nonzero if we have ever read standard input. */
160 static int have_read_stdin;
162 /* If nonzero, display usage information and exit. */
163 static int show_help;
165 /* If nonzero, print the version on standard output then exit. */
166 static int show_version;
168 static struct option const longopts[] =
170 {"bytes", required_argument, 0, 'b'},
171 {"characters", required_argument, 0, 'c'},
172 {"fields", required_argument, 0, 'f'},
173 {"delimiter", required_argument, 0, 'd'},
174 {"only-delimited", no_argument, 0, 's'},
175 {"help", no_argument, &show_help, 1},
176 {"version", no_argument, &show_version, 1},
177 {0, 0, 0, 0}
/* Print usage information and exit.  When STATUS is nonzero, write a
   one-line hint naming the --help option to standard error; otherwise
   write the option summary and the bug-report address to standard
   output.  The exit status is EXIT_SUCCESS when STATUS is 0 and
   EXIT_FAILURE otherwise; this function does not return.
   NOTE(review): the embedded numbering skips 182, 187, 190, 194, 203,
   206, 211 and 215, so this listing has presumably lost its brace-only
   lines, the line closing the first format string, and some blank
   lines of the help text.  Runs of spaces in the help text also look
   collapsed.  Restore them from a pristine copy before editing the
   string literals.  */
180 static void
181 usage (int status)
183 if (status != 0)
184 fprintf (stderr, _("Try `%s --help' for more information.\n"),
185 program_name);
186 else
188 printf (_("\
189 Usage: %s [OPTION]... [FILE]...\n\
191 program_name);
192 printf (_("\
193 Print selected parts of lines from each FILE to standard output.\n\
195 -b, --bytes=LIST output only these bytes\n\
196 -c, --characters=LIST output only these characters\n\
197 -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
198 -f, --fields=LIST output only these fields\n\
199 -n (ignored)\n\
200 -s, --only-delimited do not print lines not containing delimiters\n\
201 --help display this help and exit\n\
202 --version output version information and exit\n\
204 Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
205 range, or many ranges separated by commas. Each range is one of:\n\
207 N N'th byte, character or field, counted from 1\n\
208 N- from N'th byte, character or field, to end of line\n\
209 N-M from N'th to M'th (included) byte, character or field\n\
210 -M from first to M'th (included) byte, character or field\n\
212 With no FILE, or when FILE is -, read standard input.\n\
213 "));
214 puts (_("\nReport bugs to <textutils-bugs@gnu.org>."));
216 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
/* The following function was copied from getline.c, but with these changes:
   - Read up to and including a newline or TERMINATOR, whichever comes first.
     The original does not treat newline specially.
   - Remove unused argument, OFFSET.
   - Use xmalloc and xrealloc instead of malloc and realloc.
   - Declare this function static.  */

/* Always add at least this many bytes when extending the buffer.  */
#define MIN_CHUNK 64

/* Read up to (and including) a newline or TERMINATOR from STREAM into
   *LINEPTR (and null-terminate it).  *LINEPTR is a pointer returned from
   xmalloc (or NULL), pointing to *N characters of space.  It is
   xrealloc'd as necessary, and *N is updated to the new size.  Return
   the number of characters read (not including the null terminator),
   or -1 if an argument is NULL or if end of file or a read error is
   hit before any character has been stored.  A final line that lacks
   a newline or TERMINATOR is returned as a partial line.  */

static int
getstr (char **lineptr, int *n, FILE *stream, char terminator)
{
  /* getc yields each byte as an unsigned char value, so TERMINATOR has
     to be converted the same way; comparing against the plain char
     would never match a byte >= 0x80 where char is signed.  */
  const int stop_byte = (unsigned char) terminator;
  int nchars_avail;             /* Allocated but unused chars in *LINEPTR.  */
  char *read_pos;               /* Where we're reading into *LINEPTR.  */

  if (!lineptr || !n || !stream)
    return -1;

  if (!*lineptr)
    {
      *n = MIN_CHUNK;
      *lineptr = (char *) xmalloc (*n);
      if (!*lineptr)
        return -1;
    }

  nchars_avail = *n;
  read_pos = *lineptr;

  for (;;)
    {
      int c = getc (stream);

      /* Keep room for two more bytes: the character just read and the
         terminating NUL stored after the loop.  Growing only when no
         room at all is left would let the NUL land one byte past the
         end of the buffer whenever the line fills it exactly.  */

      assert (*n - nchars_avail == read_pos - *lineptr);
      if (nchars_avail < 2)
        {
          if (*n > MIN_CHUNK)
            *n *= 2;
          else
            *n += MIN_CHUNK;

          /* Recompute the free space for the new size before the old
             pointer is invalidated, then rebase READ_POS on the new
             buffer.  */
          nchars_avail = *n + *lineptr - read_pos;
          *lineptr = xrealloc (*lineptr, *n);
          if (!*lineptr)
            return -1;
          read_pos = *n - nchars_avail + *lineptr;
          assert (*n - nchars_avail == read_pos - *lineptr);
        }

      if (feof (stream) || ferror (stream))
        {
          /* Return partial line, if any.  */
          if (read_pos == *lineptr)
            return -1;
          else
            break;
        }

      *read_pos++ = c;
      nchars_avail--;

      if (c == stop_byte || c == '\n')
        /* Return the line.  */
        break;
    }

  /* Done - NUL terminate and return the number of chars read.  */
  *read_pos = '\0';

  return read_pos - *lineptr;
}
302 static int
303 print_kth (unsigned int k)
305 return ((0 < eol_range_start && eol_range_start <= k)
306 || (k <= max_range_endpoint && printable_field[k]));
309 /* Given the list of field or byte range specifications FIELDSTR, set
310 MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
311 array. If there is a right-open-ended range, set EOL_RANGE_START
312 to its starting index. FIELDSTR should be composed of one or more
313 numbers or ranges of numbers, separated by blanks or commas.
314 Incomplete ranges may be given: `-m' means `1-m'; `n-' means `n'
315 through end of line. Return nonzero if FIELDSTR contains at least
316 one field specification, zero otherwise. */
318 /* FIXME-someday: What if the user wants to cut out the 1,000,000-th field
319 of some huge input file? This function shouldn't have to alloate a table
320 of a million ints just so we can test every field < 10^6 with an array
321 dereference. Instead, consider using a dynamic hash table. It would be
322 simpler and nearly as good a solution to use a 32K x 4-byte table with
323 one bit per field index instead of a whole `int' per index. */
325 static int
326 set_fields (const char *fieldstr)
328 unsigned int initial = 1; /* Value of first number in a range. */
329 unsigned int value = 0; /* If nonzero, a number being accumulated. */
330 int dash_found = 0; /* Nonzero if a '-' is found in this field. */
331 int field_found = 0; /* Non-zero if at least one field spec
332 has been processed. */
334 struct range_pair *rp;
335 unsigned int n_rp;
336 unsigned int n_rp_allocated;
337 unsigned int i;
339 n_rp = 0;
340 n_rp_allocated = 16;
341 rp = (struct range_pair *) xmalloc (n_rp_allocated * sizeof (*rp));
343 /* Collect and store in RP the range end points.
344 It also sets EOL_RANGE_START if appropriate. */
346 for (;;)
348 if (*fieldstr == '-')
350 /* Starting a range. */
351 if (dash_found)
352 FATAL_ERROR (_("invalid byte or field list"));
353 dash_found++;
354 fieldstr++;
356 if (value)
358 initial = value;
359 value = 0;
361 else
362 initial = 1;
364 else if (*fieldstr == ',' || ISBLANK (*fieldstr) || *fieldstr == '\0')
366 /* Ending the string, or this field/byte sublist. */
367 if (dash_found)
369 dash_found = 0;
371 /* A range. Possibilites: -n, m-n, n-.
372 In any case, `initial' contains the start of the range. */
373 if (value == 0)
375 /* `n-'. From `initial' to end of line. */
376 eol_range_start = initial;
377 field_found = 1;
379 else
381 /* `m-n' or `-n' (1-n). */
382 if (value < initial)
383 FATAL_ERROR (_("invalid byte or field list"));
385 /* Is there already a range going to end of line? */
386 if (eol_range_start != 0)
388 /* Yes. Is the new sequence already contained
389 in the old one? If so, no processing is
390 necessary. */
391 if (initial < eol_range_start)
393 /* No, the new sequence starts before the
394 old. Does the old range going to end of line
395 extend into the new range? */
396 if (value + 1 >= eol_range_start)
398 /* Yes. Simply move the end of line marker. */
399 eol_range_start = initial;
401 else
403 /* No. A simple range, before and disjoint from
404 the range going to end of line. Fill it. */
405 ADD_RANGE_PAIR (rp, initial, value);
408 /* In any case, some fields were selected. */
409 field_found = 1;
412 else
414 /* There is no range going to end of line. */
415 ADD_RANGE_PAIR (rp, initial, value);
416 field_found = 1;
418 value = 0;
421 else if (value != 0)
423 /* A simple field number, not a range. */
424 ADD_RANGE_PAIR (rp, value, value);
425 value = 0;
426 field_found = 1;
429 if (*fieldstr == '\0')
431 break;
434 fieldstr++;
436 else if (ISDIGIT (*fieldstr))
438 /* FIXME: detect overflow? */
439 value = 10 * value + *fieldstr - '0';
440 fieldstr++;
442 else
443 FATAL_ERROR (_("invalid byte or field list"));
446 max_range_endpoint = 0;
447 for (i = 0; i < n_rp; i++)
449 if (rp[i].hi > max_range_endpoint)
450 max_range_endpoint = rp[i].hi;
453 /* Allocate an array large enough so that it may be indexed by
454 the field numbers corresponding to all finite ranges
455 (i.e. `2-6' or `-4', but not `5-') in FIELDSTR. */
457 printable_field = (int *) xmalloc ((max_range_endpoint + 1) * sizeof (int));
458 memset (printable_field, 0, (max_range_endpoint + 1) * sizeof (int));
460 /* Set the array entries corresponding to integers in the ranges of RP. */
461 for (i = 0; i < n_rp; i++)
463 unsigned int j;
464 for (j = rp[i].lo; j <= rp[i].hi; j++)
466 printable_field[j] = 1;
470 free (rp);
472 return field_found;
/* Copy the selected bytes of every line of STREAM to standard output.
   Each input newline is echoed, and a newline is added after a final
   line that ends at end of file without one.  */

static void
cut_bytes (FILE *stream)
{
  unsigned int pos = 0;         /* Bytes seen so far on the current line.  */
  int ch;                       /* Most recent character read.  */

  for (;;)
    {
      ch = getc (stream);

      if (ch == EOF)
        {
          /* Terminate a partial last line.  */
          if (pos > 0)
            putchar ('\n');
          return;
        }

      if (ch == '\n')
        {
          putchar ('\n');
          pos = 0;
          continue;
        }

      ++pos;
      if (print_kth (pos))
        putchar (ch);
    }
}
511 /* Read from stream STREAM, printing to standard output any selected fields. */
513 static void
514 cut_fields (FILE *stream)
516 int c;
517 unsigned int field_idx;
518 int found_any_selected_field;
519 int buffer_first_field;
520 int empty_input;
522 found_any_selected_field = 0;
523 field_idx = 1;
525 c = getc (stream);
526 empty_input = (c == EOF);
527 if (c != EOF)
528 ungetc (c, stream);
530 /* To support the semantics of the -s flag, we may have to buffer
531 all of the first field to determine whether it is `delimited.'
532 But that is unnecessary if all non-delimited lines must be printed
533 and the first field has been selected, or if non-delimited lines
534 must be suppressed and the first field has *not* been selected.
535 That is because a non-delimited line has exactly one field. */
536 buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
538 while (1)
540 if (field_idx == 1 && buffer_first_field)
542 int len;
544 len = getstr (&field_1_buffer, &field_1_bufsize, stream, delim);
545 if (len < 0)
546 break;
548 assert (len != 0);
550 /* If the first field extends to the end of line (it is not
551 delimited) and we are printing all non-delimited lines,
552 print this one. */
553 if (field_1_buffer[len - 1] != delim)
555 if (suppress_non_delimited)
557 /* Empty. */
559 else
561 fwrite (field_1_buffer, sizeof (char), len, stdout);
562 /* Make sure the output line is newline terminated. */
563 if (field_1_buffer[len - 1] != '\n')
564 putchar ('\n');
566 continue;
568 if (print_kth (1))
570 /* Print the field, but not the trailing delimiter. */
571 fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
572 found_any_selected_field = 1;
574 ++field_idx;
577 if (c != EOF)
579 if (print_kth (field_idx))
581 if (found_any_selected_field)
583 /* FIXME: use output delimiter here */
584 putchar (delim);
586 found_any_selected_field = 1;
588 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
590 putchar (c);
593 else
595 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
597 /* Empty. */
602 if (c == '\n')
604 c = getc (stream);
605 if (c != EOF)
607 ungetc (c, stream);
608 c = '\n';
612 if (c == delim)
613 ++field_idx;
614 else if (c == '\n' || c == EOF)
616 if (found_any_selected_field
617 || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
618 putchar ('\n');
619 if (c == EOF)
620 break;
621 field_idx = 1;
622 found_any_selected_field = 0;
627 static void
628 cut_stream (FILE *stream)
630 if (operating_mode == byte_mode)
631 cut_bytes (stream);
632 else
633 cut_fields (stream);
636 /* Process file FILE to standard output.
637 Return 0 if successful, 1 if not. */
639 static int
640 cut_file (char *file)
642 FILE *stream;
644 if (STREQ (file, "-"))
646 have_read_stdin = 1;
647 stream = stdin;
649 else
651 stream = fopen (file, "r");
652 if (stream == NULL)
654 error (0, errno, "%s", file);
655 return 1;
659 cut_stream (stream);
661 if (ferror (stream))
663 error (0, errno, "%s", file);
664 return 1;
666 if (STREQ (file, "-"))
667 clearerr (stream); /* Also clear EOF. */
668 else if (fclose (stream) == EOF)
670 error (0, errno, "%s", file);
671 return 1;
673 return 0;
677 main (int argc, char **argv)
679 int optc, exit_status = 0;
681 program_name = argv[0];
682 setlocale (LC_ALL, "");
683 bindtextdomain (PACKAGE, LOCALEDIR);
684 textdomain (PACKAGE);
686 operating_mode = undefined_mode;
688 /* By default, all non-delimited lines are printed. */
689 suppress_non_delimited = 0;
691 delim = '\0';
692 have_read_stdin = 0;
694 while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, NULL)) != -1)
696 switch (optc)
698 case 0:
699 break;
701 case 'b':
702 case 'c':
703 /* Build the byte list. */
704 if (operating_mode != undefined_mode)
705 FATAL_ERROR (_("only one type of list may be specified"));
706 operating_mode = byte_mode;
707 if (set_fields (optarg) == 0)
708 FATAL_ERROR (_("missing list of positions"));
709 break;
711 case 'f':
712 /* Build the field list. */
713 if (operating_mode != undefined_mode)
714 FATAL_ERROR (_("only one type of list may be specified"));
715 operating_mode = field_mode;
716 if (set_fields (optarg) == 0)
717 FATAL_ERROR (_("missing list of fields"));
718 break;
720 case 'd':
721 /* New delimiter. */
722 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
723 if (optarg[0] != '\0' && optarg[1] != '\0')
724 FATAL_ERROR (_("the delimiter must be a single character"));
725 delim = optarg[0];
726 break;
728 case 'n':
729 break;
731 case 's':
732 suppress_non_delimited = 1;
733 break;
735 default:
736 usage (2);
740 if (show_version)
742 printf ("cut (%s) %s\n", GNU_PACKAGE, VERSION);
743 exit (EXIT_SUCCESS);
746 if (show_help)
747 usage (0);
749 if (operating_mode == undefined_mode)
750 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
752 if (delim != '\0' && operating_mode != field_mode)
753 FATAL_ERROR (_("a delimiter may be specified only when operating on fields"));
755 if (suppress_non_delimited && operating_mode != field_mode)
756 FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
757 \tonly when operating on fields"));
759 if (delim == '\0')
760 delim = '\t';
762 if (optind == argc)
763 exit_status |= cut_file ("-");
764 else
765 for (; optind < argc; optind++)
766 exit_status |= cut_file (argv[optind]);
768 if (have_read_stdin && fclose (stdin) == EOF)
770 error (0, errno, "-");
771 exit_status = 1;
773 if (ferror (stdout) || fclose (stdout) == EOF)
774 error (EXIT_FAILURE, errno, _("write error"));
776 exit (exit_status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);