[EXIT_FAILURE]: Define.
[coreutils.git] / src / cut.c
blob801e8b8e34f0da111387bf636db3873409351c42
1 /* cut - remove parts of lines of files
2 Copyright (C) 1984, 1997, 1998 by David M. Ihnat
4 This program is a total rewrite of the Bell Laboratories Unix(Tm)
5 command of the same name, as of System V. It contains no proprietary
6 code, and therefore may be used without violation of any proprietary
7 agreements whatsoever. However, you will notice that the program is
8 copyrighted by me. This is to assure the program does *not* fall
9 into the public domain. Thus, I may specify just what I am now:
10 This program may be freely copied and distributed, provided this notice
11 remains; it may not be sold for profit without express written consent of
12 the author.
13 Please note that I recreated the behavior of the Unix(Tm) 'cut' command
14 as faithfully as possible; however, I haven't run a full set of regression
15 tests. Thus, the user of this program accepts full responsibility for any
16 effects or loss; in particular, the author is not responsible for any losses,
17 explicit or incidental, that may be incurred through use of this program.
19 I ask that any bugs (and, if possible, fixes) be reported to me when
20 possible. -David Ihnat (312) 784-4544 ignatz@homebru.chi.il.us
22 POSIX changes, bug fixes, long-named options, and cleanup
23 by David MacKenzie <djm@gnu.ai.mit.edu>.
25 Rewrite cut_fields and cut_bytes -- Jim Meyering (meyering@comco.com).
27 Options:
28 --bytes=byte-list
29 -b byte-list Print only the bytes in positions listed
30 in BYTE-LIST.
31 Tabs and backspaces are treated like any
32 other character; they take up 1 byte.
34 --characters=character-list
35 -c character-list Print only characters in positions listed
36 in CHARACTER-LIST.
37 The same as -b for now, but
38 internationalization will change that.
39 Tabs and backspaces are treated like any
40 other character; they take up 1 character.
42 --fields=field-list
43 -f field-list Print only the fields listed in FIELD-LIST.
44 Fields are separated by a TAB by default.
46 --delimiter=delim
47 -d delim For -f, fields are separated by the first
48 character in DELIM instead of TAB.
50 -n Do not split multibyte chars (no-op for now).
52 --only-delimited
53 -s For -f, do not print lines that do not contain
54 the field separator character.
56 The BYTE-LIST, CHARACTER-LIST, and FIELD-LIST are one or more numbers
57 or ranges separated by commas. The first byte, character, and field
58 are numbered 1.
60 A FILE of `-' means standard input. */
62 #include <config.h>
64 #include <stdio.h>
65 #include <assert.h>
66 #include <getopt.h>
67 #include <sys/types.h>
68 #include "system.h"
69 #include "error.h"
71 char *xstrdup ();
/* Report the diagnostic S on standard error, then call usage with a
   nonzero status, which prints the `--help' hint and exits.  S is
   handed to error as the argument of a fixed "%s" format rather than
   as the format itself, so a `%' that ever appears in a (possibly
   translated) message cannot be taken for a conversion specification.  */
#define FATAL_ERROR(s)                                                  \
  do                                                                    \
    {                                                                   \
      error (0, 0, "%s", (s));                                          \
      usage (2);                                                        \
    }                                                                   \
  while (0)
/* Append the pair (LOW, HIGH) to the array RP of range pairs, doubling
   its allocation first when it is full.  The macro relies on two
   variables being in scope where it is used: N_RP, the number of pairs
   stored so far, and N_RP_ALLOCATED, the current capacity of RP (which
   must be nonzero for the doubling to make progress).  Both are
   updated as needed.  RP is evaluated more than once, so it must be a
   side-effect-free lvalue.  */
#define ADD_RANGE_PAIR(rp, low, high)                                   \
  do                                                                    \
    {                                                                   \
      if (n_rp >= n_rp_allocated)                                       \
        {                                                               \
          n_rp_allocated *= 2;                                          \
          (rp) = (struct range_pair *) xrealloc ((char *) (rp),         \
                                 n_rp_allocated * sizeof (*(rp)));      \
        }                                                               \
      (rp)[n_rp].lo = (low);                                            \
      (rp)[n_rp].hi = (high);                                           \
      ++n_rp;                                                           \
    }                                                                   \
  while (0)
/* One range of byte or field indices, LO through HI inclusive.  */
struct range_pair
  {
    unsigned int lo;
    unsigned int hi;
  };
/* Holding area for the first field of a line.  Whether a line is
   printed at all can depend on whether its first field is followed by
   a delimiter or by a newline (the -s option, or its absence), so in
   the cases where that matters the whole first field is read into this
   buffer before any of it is written.  In the remaining cases
   cut_fields does its job without using this buffer.  */
static char *field_1_buffer;

/* The number of bytes allocated for FIELD_1_BUFFER.  */
static int field_1_bufsize;

/* The largest field or byte index used as an endpoint of a closed
   or degenerate range specification.  The starting index of a
   right-open-ended range is not counted: with either range spec
   `2-5,9-' or `2-3,5,9-' this variable would be set to 5.  */
static unsigned int max_range_endpoint;

/* If nonzero, this is the index of the first field in a range that goes
   to end of line.  */
static unsigned int eol_range_start;

/* In byte mode, which bytes to output.
   In field mode, which DELIM-separated fields to output.
   Both bytes and fields are numbered starting with 1,
   so the zeroth element of this array is unused.
   A field or byte K has been selected if
   (K <= MAX_RANGE_ENDPOINT && PRINTABLE_FIELD[K])
    || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START).  */
static int *printable_field;
/* Which kind of position list governs the output.  */
enum operating_mode
  {
    undefined_mode,

    /* Output characters that are in the given bytes.  */
    byte_mode,

    /* Output the given delimiter-separated fields.  */
    field_mode
  };
148 /* The name this program was run with. */
149 char *program_name;
151 static enum operating_mode operating_mode;
153 /* If nonzero do not output lines containing no delimeter characters.
154 Otherwise, all such lines are printed. This option is valid only
155 with field mode. */
156 static int suppress_non_delimited;
158 /* The delimeter character for field mode. */
159 static int delim;
161 /* The length of output_delimiter_string. */
162 static size_t output_delimiter_length;
164 /* The output field separator string. Defaults to the 1-character
165 string consisting of the input delimiter. */
166 static char *output_delimiter_string;
168 /* Nonzero if we have ever read standard input. */
169 static int have_read_stdin;
171 /* If nonzero, display usage information and exit. */
172 static int show_help;
174 /* If nonzero, print the version on standard output then exit. */
175 static int show_version;
177 static struct option const longopts[] =
179 {"bytes", required_argument, 0, 'b'},
180 {"characters", required_argument, 0, 'c'},
181 {"fields", required_argument, 0, 'f'},
182 {"delimiter", required_argument, 0, 'd'},
183 {"only-delimited", no_argument, 0, 's'},
184 {"output-delimiter", required_argument, 0, CHAR_MAX + 1},
185 {"help", no_argument, &show_help, 1},
186 {"version", no_argument, &show_version, 1},
187 {0, 0, 0, 0}
/* Print usage information and exit; this function does not return.
   If STATUS is nonzero, write a one-line hint to standard error that
   points the user at --help.  Otherwise print the full help text,
   followed by the bug-report address, on standard output.
   The process exits with EXIT_SUCCESS when STATUS is zero and with
   EXIT_FAILURE for any other value.  */
190 static void
191 usage (int status)
193 if (status != 0)
194 fprintf (stderr, _("Try `%s --help' for more information.\n"),
195 program_name);
196 else
198 printf (_("\
199 Usage: %s [OPTION]... [FILE]...\n\
201 program_name);
202 printf (_("\
203 Print selected parts of lines from each FILE to standard output.\n\
205 -b, --bytes=LIST output only these bytes\n\
206 -c, --characters=LIST output only these characters\n\
207 -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
208 -f, --fields=LIST output only these fields\n\
209 -n (ignored)\n\
210 -s, --only-delimited do not print lines not containing delimiters\n\
211 --output-delimiter=STRING use STRING as the output delimiter\n\
212 the default is to use the input delimiter\n\
213 --help display this help and exit\n\
214 --version output version information and exit\n\
216 Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
217 range, or many ranges separated by commas. Each range is one of:\n\
219 N N'th byte, character or field, counted from 1\n\
220 N- from N'th byte, character or field, to end of line\n\
221 N-M from N'th to M'th (included) byte, character or field\n\
222 -M from first to M'th (included) byte, character or field\n\
224 With no FILE, or when FILE is -, read standard input.\n\
225 "));
226 puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
228 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
/* The following function was copied from getline.c, but with these changes:
   - Read up to and including a newline or TERMINATOR, whichever comes first.
     The original does not treat newline specially.
   - Remove unused argument, OFFSET.
   - Use xmalloc and xrealloc instead of malloc and realloc.
   - Declare this function static.  */

/* Always add at least this many bytes when extending the buffer.  */
#define MIN_CHUNK 64

/* Read up to (and including) a newline or TERMINATOR from STREAM into
   *LINEPTR (and null-terminate it).  *LINEPTR is a pointer returned from
   xmalloc (or NULL), pointing to *N characters of space.  It is
   xrealloc'd as necessary, and *N is updated to the new size.

   Return the number of characters read (not including the null
   terminator).  Return -1 if any pointer argument is null, or if end
   of file or a read error is hit before a single character has been
   stored; a final line that lacks a terminator is returned normally.  */

static int
getstr (char **lineptr, int *n, FILE *stream, char terminator)
{
  /* getc yields each byte as an unsigned char value, so the terminator
     must be compared in that same domain.  A plain `char' with its
     high bit set may be negative and would otherwise never match.  */
  const int term = (unsigned char) terminator;

  int nchars_avail;		/* Allocated but unused chars in *LINEPTR.  */
  char *read_pos;		/* Where we're reading into *LINEPTR.  */

  if (!lineptr || !n || !stream)
    return -1;

  if (!*lineptr)
    {
      *n = MIN_CHUNK;
      *lineptr = (char *) xmalloc (*n);
      if (!*lineptr)
        return -1;
    }

  nchars_avail = *n;
  read_pos = *lineptr;

  for (;;)
    {
      int c = getc (stream);

      assert (*n - nchars_avail == read_pos - *lineptr);

      /* Keep room for two bytes: the one about to be stored and the
         NUL that is appended after the loop.  Growing only when no
         room at all is left would let a line that exactly fills the
         buffer push the NUL one byte past the end of the allocation.  */
      if (nchars_avail < 2)
        {
          /* Remember the fill level as an offset; the block may move,
             and pointers into the old block must not be used to
             compute positions in the new one.  */
          int used = (int) (read_pos - *lineptr);

          if (*n > MIN_CHUNK)
            *n *= 2;
          else
            *n += MIN_CHUNK;

          *lineptr = xrealloc (*lineptr, *n);
          if (!*lineptr)
            return -1;

          read_pos = *lineptr + used;
          nchars_avail = *n - used;
          assert (*n - nchars_avail == read_pos - *lineptr);
        }

      if (feof (stream) || ferror (stream))
        {
          /* Return partial line, if any.  */
          if (read_pos == *lineptr)
            return -1;
          break;
        }

      *read_pos++ = c;
      nchars_avail--;

      if (c == term || c == '\n')
        /* Return the line.  */
        break;
    }

  /* Done - NUL terminate and return the number of chars read.  */
  *read_pos = '\0';

  return read_pos - *lineptr;
}
314 static int
315 print_kth (unsigned int k)
317 return ((0 < eol_range_start && eol_range_start <= k)
318 || (k <= max_range_endpoint && printable_field[k]));
321 /* Given the list of field or byte range specifications FIELDSTR, set
322 MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
323 array. If there is a right-open-ended range, set EOL_RANGE_START
324 to its starting index. FIELDSTR should be composed of one or more
325 numbers or ranges of numbers, separated by blanks or commas.
326 Incomplete ranges may be given: `-m' means `1-m'; `n-' means `n'
327 through end of line. Return nonzero if FIELDSTR contains at least
328 one field specification, zero otherwise. */
330 /* FIXME-someday: What if the user wants to cut out the 1,000,000-th field
331 of some huge input file? This function shouldn't have to alloate a table
332 of a million ints just so we can test every field < 10^6 with an array
333 dereference. Instead, consider using a dynamic hash table. It would be
334 simpler and nearly as good a solution to use a 32K x 4-byte table with
335 one bit per field index instead of a whole `int' per index. */
337 static int
338 set_fields (const char *fieldstr)
340 unsigned int initial = 1; /* Value of first number in a range. */
341 unsigned int value = 0; /* If nonzero, a number being accumulated. */
342 int dash_found = 0; /* Nonzero if a '-' is found in this field. */
343 int field_found = 0; /* Non-zero if at least one field spec
344 has been processed. */
346 struct range_pair *rp;
347 unsigned int n_rp;
348 unsigned int n_rp_allocated;
349 unsigned int i;
351 n_rp = 0;
352 n_rp_allocated = 16;
353 rp = (struct range_pair *) xmalloc (n_rp_allocated * sizeof (*rp));
355 /* Collect and store in RP the range end points.
356 It also sets EOL_RANGE_START if appropriate. */
358 for (;;)
360 if (*fieldstr == '-')
362 /* Starting a range. */
363 if (dash_found)
364 FATAL_ERROR (_("invalid byte or field list"));
365 dash_found++;
366 fieldstr++;
368 if (value)
370 initial = value;
371 value = 0;
373 else
374 initial = 1;
376 else if (*fieldstr == ',' || ISBLANK (*fieldstr) || *fieldstr == '\0')
378 /* Ending the string, or this field/byte sublist. */
379 if (dash_found)
381 dash_found = 0;
383 /* A range. Possibilites: -n, m-n, n-.
384 In any case, `initial' contains the start of the range. */
385 if (value == 0)
387 /* `n-'. From `initial' to end of line. */
388 eol_range_start = initial;
389 field_found = 1;
391 else
393 /* `m-n' or `-n' (1-n). */
394 if (value < initial)
395 FATAL_ERROR (_("invalid byte or field list"));
397 /* Is there already a range going to end of line? */
398 if (eol_range_start != 0)
400 /* Yes. Is the new sequence already contained
401 in the old one? If so, no processing is
402 necessary. */
403 if (initial < eol_range_start)
405 /* No, the new sequence starts before the
406 old. Does the old range going to end of line
407 extend into the new range? */
408 if (value + 1 >= eol_range_start)
410 /* Yes. Simply move the end of line marker. */
411 eol_range_start = initial;
413 else
415 /* No. A simple range, before and disjoint from
416 the range going to end of line. Fill it. */
417 ADD_RANGE_PAIR (rp, initial, value);
420 /* In any case, some fields were selected. */
421 field_found = 1;
424 else
426 /* There is no range going to end of line. */
427 ADD_RANGE_PAIR (rp, initial, value);
428 field_found = 1;
430 value = 0;
433 else if (value != 0)
435 /* A simple field number, not a range. */
436 ADD_RANGE_PAIR (rp, value, value);
437 value = 0;
438 field_found = 1;
441 if (*fieldstr == '\0')
443 break;
446 fieldstr++;
448 else if (ISDIGIT (*fieldstr))
450 /* FIXME: detect overflow? */
451 value = 10 * value + *fieldstr - '0';
452 fieldstr++;
454 else
455 FATAL_ERROR (_("invalid byte or field list"));
458 max_range_endpoint = 0;
459 for (i = 0; i < n_rp; i++)
461 if (rp[i].hi > max_range_endpoint)
462 max_range_endpoint = rp[i].hi;
465 /* Allocate an array large enough so that it may be indexed by
466 the field numbers corresponding to all finite ranges
467 (i.e. `2-6' or `-4', but not `5-') in FIELDSTR. */
469 printable_field = (int *) xmalloc ((max_range_endpoint + 1) * sizeof (int));
470 memset (printable_field, 0, (max_range_endpoint + 1) * sizeof (int));
472 /* Set the array entries corresponding to integers in the ranges of RP. */
473 for (i = 0; i < n_rp; i++)
475 unsigned int j;
476 for (j = rp[i].lo; j <= rp[i].hi; j++)
478 printable_field[j] = 1;
482 free (rp);
484 return field_found;
/* Copy the selected bytes of every line of STREAM to standard output.
   Each input newline is echoed and restarts the byte count; if the
   input ends in the middle of a line, a newline is supplied so that
   the output is always newline-terminated.  */

static void
cut_bytes (FILE *stream)
{
  unsigned int pos = 0;		/* Bytes seen so far on the current line.  */
  int ch;			/* Each character from the file.  */

  while ((ch = getc (stream)) != EOF)
    {
      if (ch == '\n')
        {
          putchar ('\n');
          pos = 0;
          continue;
        }

      if (print_kth (++pos))
        putchar (ch);
    }

  /* A nonzero count here means the last line had no newline.  */
  if (pos > 0)
    putchar ('\n');
}
523 /* Read from stream STREAM, printing to standard output any selected fields. */
525 static void
526 cut_fields (FILE *stream)
528 int c;
529 unsigned int field_idx;
530 int found_any_selected_field;
531 int buffer_first_field;
532 int empty_input;
534 found_any_selected_field = 0;
535 field_idx = 1;
537 c = getc (stream);
538 empty_input = (c == EOF);
539 if (c != EOF)
540 ungetc (c, stream);
542 /* To support the semantics of the -s flag, we may have to buffer
543 all of the first field to determine whether it is `delimited.'
544 But that is unnecessary if all non-delimited lines must be printed
545 and the first field has been selected, or if non-delimited lines
546 must be suppressed and the first field has *not* been selected.
547 That is because a non-delimited line has exactly one field. */
548 buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
550 while (1)
552 if (field_idx == 1 && buffer_first_field)
554 int len;
556 len = getstr (&field_1_buffer, &field_1_bufsize, stream, delim);
557 if (len < 0)
558 break;
560 assert (len != 0);
562 /* If the first field extends to the end of line (it is not
563 delimited) and we are printing all non-delimited lines,
564 print this one. */
565 if (field_1_buffer[len - 1] != delim)
567 if (suppress_non_delimited)
569 /* Empty. */
571 else
573 fwrite (field_1_buffer, sizeof (char), len, stdout);
574 /* Make sure the output line is newline terminated. */
575 if (field_1_buffer[len - 1] != '\n')
576 putchar ('\n');
578 continue;
580 if (print_kth (1))
582 /* Print the field, but not the trailing delimiter. */
583 fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
584 found_any_selected_field = 1;
586 ++field_idx;
589 if (c != EOF)
591 if (print_kth (field_idx))
593 if (found_any_selected_field)
595 fwrite (output_delimiter_string, sizeof (char),
596 output_delimiter_length, stdout);
598 found_any_selected_field = 1;
600 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
602 putchar (c);
605 else
607 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
609 /* Empty. */
614 if (c == '\n')
616 c = getc (stream);
617 if (c != EOF)
619 ungetc (c, stream);
620 c = '\n';
624 if (c == delim)
625 ++field_idx;
626 else if (c == '\n' || c == EOF)
628 if (found_any_selected_field
629 || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
630 putchar ('\n');
631 if (c == EOF)
632 break;
633 field_idx = 1;
634 found_any_selected_field = 0;
639 static void
640 cut_stream (FILE *stream)
642 if (operating_mode == byte_mode)
643 cut_bytes (stream);
644 else
645 cut_fields (stream);
648 /* Process file FILE to standard output.
649 Return 0 if successful, 1 if not. */
651 static int
652 cut_file (char *file)
654 FILE *stream;
656 if (STREQ (file, "-"))
658 have_read_stdin = 1;
659 stream = stdin;
661 else
663 stream = fopen (file, "r");
664 if (stream == NULL)
666 error (0, errno, "%s", file);
667 return 1;
671 cut_stream (stream);
673 if (ferror (stream))
675 error (0, errno, "%s", file);
676 return 1;
678 if (STREQ (file, "-"))
679 clearerr (stream); /* Also clear EOF. */
680 else if (fclose (stream) == EOF)
682 error (0, errno, "%s", file);
683 return 1;
685 return 0;
689 main (int argc, char **argv)
691 int optc, exit_status = 0;
692 int delim_specified = 0;
694 program_name = argv[0];
695 setlocale (LC_ALL, "");
696 bindtextdomain (PACKAGE, LOCALEDIR);
697 textdomain (PACKAGE);
699 operating_mode = undefined_mode;
701 /* By default, all non-delimited lines are printed. */
702 suppress_non_delimited = 0;
704 delim = '\0';
705 have_read_stdin = 0;
707 while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, NULL)) != -1)
709 switch (optc)
711 case 0:
712 break;
714 case 'b':
715 case 'c':
716 /* Build the byte list. */
717 if (operating_mode != undefined_mode)
718 FATAL_ERROR (_("only one type of list may be specified"));
719 operating_mode = byte_mode;
720 if (set_fields (optarg) == 0)
721 FATAL_ERROR (_("missing list of positions"));
722 break;
724 case 'f':
725 /* Build the field list. */
726 if (operating_mode != undefined_mode)
727 FATAL_ERROR (_("only one type of list may be specified"));
728 operating_mode = field_mode;
729 if (set_fields (optarg) == 0)
730 FATAL_ERROR (_("missing list of fields"));
731 break;
733 case 'd':
734 /* New delimiter. */
735 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
736 if (optarg[0] != '\0' && optarg[1] != '\0')
737 FATAL_ERROR (_("the delimiter must be a single character"));
738 delim = optarg[0];
739 delim_specified = 1;
740 break;
742 case CHAR_MAX + 1:
743 /* Interpret --output-delimiter='' to mean
744 `use the NUL byte as the delimiter.' */
745 output_delimiter_length = (optarg[0] == '\0'
746 ? 1 : strlen (optarg));
747 output_delimiter_string = xstrdup (optarg);
748 break;
750 case 'n':
751 break;
753 case 's':
754 suppress_non_delimited = 1;
755 break;
757 default:
758 usage (2);
762 if (show_version)
764 printf ("cut (%s) %s\n", GNU_PACKAGE, VERSION);
765 exit (EXIT_SUCCESS);
768 if (show_help)
769 usage (0);
771 if (operating_mode == undefined_mode)
772 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
774 if (delim != '\0' && operating_mode != field_mode)
775 FATAL_ERROR (_("a delimiter may be specified only when operating on fields"));
777 if (suppress_non_delimited && operating_mode != field_mode)
778 FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
779 \tonly when operating on fields"));
781 if (!delim_specified)
782 delim = '\t';
784 if (output_delimiter_string == NULL)
786 static char dummy[2];
787 dummy[0] = delim;
788 dummy[1] = '\0';
789 output_delimiter_string = dummy;
790 output_delimiter_length = 1;
793 if (optind == argc)
794 exit_status |= cut_file ("-");
795 else
796 for (; optind < argc; optind++)
797 exit_status |= cut_file (argv[optind]);
799 if (have_read_stdin && fclose (stdin) == EOF)
801 error (0, errno, "-");
802 exit_status = 1;
804 if (ferror (stdout) || fclose (stdout) == EOF)
805 error (EXIT_FAILURE, errno, _("write error"));
807 exit (exit_status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);