.
[coreutils.git] / src / cut.c
blobf5a79df59c7e38663acbba4e2214b213a6b9106f
/* cut - remove parts of lines of files
   Copyright (C) 1984, 1997 by David M. Ihnat

   This program is a total rewrite of the Bell Laboratories Unix(Tm)
   command of the same name, as of System V.  It contains no proprietary
   code, and therefore may be used without violation of any proprietary
   agreements whatsoever.  However, you will notice that the program is
   copyrighted by me.  This is to assure the program does *not* fall
   into the public domain.  Thus, I may specify just what I am now:
   This program may be freely copied and distributed, provided this notice
   remains; it may not be sold for profit without express written consent of
   the author.
   Please note that I recreated the behavior of the Unix(Tm) 'cut' command
   as faithfully as possible; however, I haven't run a full set of regression
   tests.  Thus, the user of this program accepts full responsibility for any
   effects or loss; in particular, the author is not responsible for any losses,
   explicit or incidental, that may be incurred through use of this program.

   I ask that any bugs (and, if possible, fixes) be reported to me when
   possible.  -David Ihnat (312) 784-4544 ignatz@homebru.chi.il.us

   POSIX changes, bug fixes, long-named options, and cleanup
   by David MacKenzie <djm@gnu.ai.mit.edu>.

   Rewrite cut_fields and cut_bytes -- Jim Meyering (meyering@comco.com).

   Options:
   --bytes=byte-list
   -b byte-list                 Print only the bytes in positions listed
                                in BYTE-LIST.
                                Tabs and backspaces are treated like any
                                other character; they take up 1 byte.

   --characters=character-list
   -c character-list            Print only characters in positions listed
                                in CHARACTER-LIST.
                                The same as -b for now, but
                                internationalization will change that.
                                Tabs and backspaces are treated like any
                                other character; they take up 1 character.

   --fields=field-list
   -f field-list                Print only the fields listed in FIELD-LIST.
                                Fields are separated by a TAB by default.

   --delimiter=delim
   -d delim                     For -f, fields are separated by the first
                                character in DELIM instead of TAB.

   -n                           Do not split multibyte chars (no-op for now).

   --only-delimited
   -s                           For -f, do not print lines that do not contain
                                the field separator character.

   The BYTE-LIST, CHARACTER-LIST, and FIELD-LIST are one or more numbers
   or ranges separated by commas.  The first byte, character, and field
   are numbered 1.

   A FILE of `-' means standard input.  */
62 #include <config.h>
64 /* Get isblank from GNU libc. */
65 #define _GNU_SOURCE
67 #include <stdio.h>
69 #define NDEBUG
70 #include <assert.h>
72 #include <getopt.h>
73 #include <sys/types.h>
74 #include "system.h"
75 #include "error.h"
/* Report the diagnostic S through error (), then call usage (2), which
   prints the "Try --help" hint and exits.  S is handed to error () as a
   "%s" argument rather than as the format itself, so a `%' appearing in
   a (possibly translated) message is never treated as a conversion.  */
#define FATAL_ERROR(s)                                                  \
  do                                                                    \
    {                                                                   \
      error (0, 0, "%s", (s));                                          \
      usage (2);                                                        \
    }                                                                   \
  while (0)
/* Append the pair (LOW, HIGH) to the array RP of range pairs, doubling
   the allocation first when it is full.  This macro reads and updates
   the variables N_RP (number of pairs stored) and N_RP_ALLOCATED
   (number of pairs allocated), which must be in scope, as modifiable
   lvalues, at the point of use.  RP must be an lvalue because it is
   reassigned when the array is reallocated.  */

#define ADD_RANGE_PAIR(rp, low, high)                                   \
  do                                                                    \
    {                                                                   \
      if (n_rp >= n_rp_allocated)                                       \
        {                                                               \
          n_rp_allocated *= 2;                                          \
          (rp) = (struct range_pair *) xrealloc ((char *) (rp),         \
                                  n_rp_allocated * sizeof (*(rp)));     \
        }                                                               \
      (rp)[n_rp].lo = (low);                                            \
      (rp)[n_rp].hi = (high);                                           \
      ++n_rp;                                                           \
    }                                                                   \
  while (0)
/* One closed range LO..HI (both inclusive, 1-based) of byte or field
   positions, as collected by set_fields.  */
struct range_pair
  {
    unsigned int lo;
    unsigned int hi;
  };
/* This buffer is used to support the semantics of the -s option
   (or lack of same) when the specified field list includes (does
   not include) the first field.  In both of those cases, the entire
   first field must be read into this buffer to determine whether it
   is followed by a delimiter or a newline before any of it may be
   output.  Otherwise, cut_fields can do the job without using this
   buffer.  */
static char *field_1_buffer;

/* The number of bytes allocated for FIELD_1_BUFFER.  */
static int field_1_bufsize;

/* The largest field or byte index used as an endpoint of a closed
   or degenerate range specification; this doesn't include the starting
   index of right-open-ended ranges.  For example, with either range spec
   `2-5,9-', `2-3,5,9-' this variable would be set to 5.  */
static unsigned int max_range_endpoint;

/* If nonzero, this is the index of the first field in a range that goes
   to end of line.  */
static unsigned int eol_range_start;

/* In byte mode, which bytes to output.
   In field mode, which DELIM-separated fields to output.
   Both bytes and fields are numbered starting with 1,
   so the zeroth element of this array is unused.
   A field or byte K has been selected if
   (K <= MAX_RANGE_ENDPOINT and PRINTABLE_FIELD[K])
    || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START).  */
static int *printable_field;
/* What kind of list (-b/-c versus -f) was given on the command line.  */
enum operating_mode
  {
    /* No list option has been seen yet.  */
    undefined_mode,

    /* Output characters that are in the given bytes.  */
    byte_mode,

    /* Output the given delimiter-separated fields.  */
    field_mode
  };
152 /* The name this program was run with. */
153 char *program_name;
155 static enum operating_mode operating_mode;
157 /* If nonzero do not output lines containing no delimeter characters.
158 Otherwise, all such lines are printed. This option is valid only
159 with field mode. */
160 static int suppress_non_delimited;
162 /* The delimeter character for field mode. */
163 static int delim;
165 /* Nonzero if we have ever read standard input. */
166 static int have_read_stdin;
168 /* If nonzero, display usage information and exit. */
169 static int show_help;
171 /* If nonzero, print the version on standard output then exit. */
172 static int show_version;
174 static struct option const longopts[] =
176 {"bytes", required_argument, 0, 'b'},
177 {"characters", required_argument, 0, 'c'},
178 {"fields", required_argument, 0, 'f'},
179 {"delimiter", required_argument, 0, 'd'},
180 {"only-delimited", no_argument, 0, 's'},
181 {"help", no_argument, &show_help, 1},
182 {"version", no_argument, &show_version, 1},
183 {0, 0, 0, 0}
186 static void
187 usage (int status)
189 if (status != 0)
190 fprintf (stderr, _("Try `%s --help' for more information.\n"),
191 program_name);
192 else
194 printf (_("\
195 Usage: %s [OPTION]... [FILE]...\n\
197 program_name);
198 printf (_("\
199 Print selected parts of lines from each FILE to standard output.\n\
201 -b, --bytes=LIST output only these bytes\n\
202 -c, --characters=LIST output only these characters\n\
203 -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
204 -f, --fields=LIST output only these fields\n\
205 -n (ignored)\n\
206 -s, --only-delimited do not print lines not containing delimiters\n\
207 --help display this help and exit\n\
208 --version output version information and exit\n\
210 Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
211 range, or many ranges separated by commas. Each range is one of:\n\
213 N N'th byte, character or field, counted from 1\n\
214 N- from N'th byte, character or field, to end of line\n\
215 N-M from N'th to M'th (included) byte, character or field\n\
216 -M from first to M'th (included) byte, character or field\n\
218 With no FILE, or when FILE is -, read standard input.\n\
219 "));
220 puts (_("\nReport bugs to <textutils-bugs@gnu.org>."));
222 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
/* The following function was copied from getline.c, but with these changes:
   - Read up to and including a newline or TERMINATOR, whichever comes first.
   The original does not treat newline specially.
   - Remove unused argument, OFFSET.
   - Use xmalloc and xrealloc instead of malloc and realloc.
   - Declare this function static.  */

/* Always add at least this many bytes when extending the buffer.  */
#define MIN_CHUNK 64

/* Read up to (and including) a newline or TERMINATOR from STREAM into
   *LINEPTR (and null-terminate it).  *LINEPTR is a pointer returned from
   xmalloc (or NULL), pointing to *N characters of space.  It is
   xrealloc'd as necessary, and *N is updated to the new size.  Return
   the number of characters read (not including the null terminator),
   or -1 on error or EOF when nothing at all was read.  A partial final
   line (EOF before any terminator) is returned like a complete one.  */

static int
getstr (char **lineptr, int *n, FILE *stream, char terminator)
{
  /* getc yields bytes as unsigned char values, so compare against
     TERMINATOR in that same form; a plain (possibly signed) char with
     the high bit set would otherwise never compare equal.  */
  const int term_byte = (unsigned char) terminator;
  int nchars_avail;             /* Allocated but unused chars in *LINEPTR.  */
  char *read_pos;               /* Where we're reading into *LINEPTR.  */

  if (!lineptr || !n || !stream)
    return -1;

  if (!*lineptr)
    {
      *n = MIN_CHUNK;
      *lineptr = (char *) xmalloc (*n);
      if (!*lineptr)
        return -1;
    }

  nchars_avail = *n;
  read_pos = *lineptr;

  for (;;)
    {
      int c = getc (stream);

      /* Keep room for TWO chars here: the one about to be stored and
         the terminating NUL written after the loop.  Reserving only
         one lets the NUL land one byte past the end of the buffer
         whenever the terminator is stored in the last free slot.  */

      assert (*n - nchars_avail == read_pos - *lineptr);
      if (nchars_avail < 2)
        {
          if (*n > MIN_CHUNK)
            *n *= 2;
          else
            *n += MIN_CHUNK;

          /* Recompute the free space against the new size before the
             buffer moves, then rebase READ_POS onto the new block.  */
          nchars_avail = *n + *lineptr - read_pos;
          *lineptr = xrealloc (*lineptr, *n);
          if (!*lineptr)
            return -1;
          read_pos = *n - nchars_avail + *lineptr;
          assert (*n - nchars_avail == read_pos - *lineptr);
        }

      if (feof (stream) || ferror (stream))
        {
          /* Return partial line, if any.  */
          if (read_pos == *lineptr)
            return -1;
          else
            break;
        }

      *read_pos++ = c;
      nchars_avail--;

      if (c == term_byte || c == '\n')
        /* Return the line.  */
        break;
    }

  /* Done - NUL terminate and return the number of chars read.  */
  *read_pos = '\0';

  return (int) (read_pos - *lineptr);
}
308 static int
309 print_kth (unsigned int k)
311 return ((0 < eol_range_start && eol_range_start <= k)
312 || (k <= max_range_endpoint && printable_field[k]));
315 /* Given the list of field or byte range specifications FIELDSTR, set
316 MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
317 array. If there is a right-open-ended range, set EOL_RANGE_START
318 to its starting index. FIELDSTR should be composed of one or more
319 numbers or ranges of numbers, separated by blanks or commas.
320 Incomplete ranges may be given: `-m' means `1-m'; `n-' means `n'
321 through end of line. Return nonzero if FIELDSTR contains at least
322 one field specification, zero otherwise. */
324 /* FIXME-someday: What if the user wants to cut out the 1,000,000-th field
325 of some huge input file? This function shouldn't have to alloate a table
326 of a million ints just so we can test every field < 10^6 with an array
327 dereference. Instead, consider using a dynamic hash table. It would be
328 simpler and nearly as good a solution to use a 32K x 4-byte table with
329 one bit per field index instead of a whole `int' per index. */
331 static int
332 set_fields (const char *fieldstr)
334 unsigned int initial = 1; /* Value of first number in a range. */
335 unsigned int value = 0; /* If nonzero, a number being accumulated. */
336 int dash_found = 0; /* Nonzero if a '-' is found in this field. */
337 int field_found = 0; /* Non-zero if at least one field spec
338 has been processed. */
340 struct range_pair *rp;
341 unsigned int n_rp;
342 unsigned int n_rp_allocated;
343 unsigned int i;
345 n_rp = 0;
346 n_rp_allocated = 16;
347 rp = (struct range_pair *) xmalloc (n_rp_allocated * sizeof (*rp));
349 /* Collect and store in RP the range end points.
350 It also sets EOL_RANGE_START if appropriate. */
352 for (;;)
354 if (*fieldstr == '-')
356 /* Starting a range. */
357 if (dash_found)
358 FATAL_ERROR (_("invalid byte or field list"));
359 dash_found++;
360 fieldstr++;
362 if (value)
364 initial = value;
365 value = 0;
367 else
368 initial = 1;
370 else if (*fieldstr == ',' || ISBLANK (*fieldstr) || *fieldstr == '\0')
372 /* Ending the string, or this field/byte sublist. */
373 if (dash_found)
375 dash_found = 0;
377 /* A range. Possibilites: -n, m-n, n-.
378 In any case, `initial' contains the start of the range. */
379 if (value == 0)
381 /* `n-'. From `initial' to end of line. */
382 eol_range_start = initial;
383 field_found = 1;
385 else
387 /* `m-n' or `-n' (1-n). */
388 if (value < initial)
389 FATAL_ERROR (_("invalid byte or field list"));
391 /* Is there already a range going to end of line? */
392 if (eol_range_start != 0)
394 /* Yes. Is the new sequence already contained
395 in the old one? If so, no processing is
396 necessary. */
397 if (initial < eol_range_start)
399 /* No, the new sequence starts before the
400 old. Does the old range going to end of line
401 extend into the new range? */
402 if (value + 1 >= eol_range_start)
404 /* Yes. Simply move the end of line marker. */
405 eol_range_start = initial;
407 else
409 /* No. A simple range, before and disjoint from
410 the range going to end of line. Fill it. */
411 ADD_RANGE_PAIR (rp, initial, value);
414 /* In any case, some fields were selected. */
415 field_found = 1;
418 else
420 /* There is no range going to end of line. */
421 ADD_RANGE_PAIR (rp, initial, value);
422 field_found = 1;
424 value = 0;
427 else if (value != 0)
429 /* A simple field number, not a range. */
430 ADD_RANGE_PAIR (rp, value, value);
431 value = 0;
432 field_found = 1;
435 if (*fieldstr == '\0')
437 break;
440 fieldstr++;
442 else if (ISDIGIT (*fieldstr))
444 /* FIXME: detect overflow? */
445 value = 10 * value + *fieldstr - '0';
446 fieldstr++;
448 else
449 FATAL_ERROR (_("invalid byte or field list"));
452 max_range_endpoint = 0;
453 for (i = 0; i < n_rp; i++)
455 if (rp[i].hi > max_range_endpoint)
456 max_range_endpoint = rp[i].hi;
459 /* Allocate an array large enough so that it may be indexed by
460 the field numbers corresponding to all finite ranges
461 (i.e. `2-6' or `-4', but not `5-') in FIELDSTR. */
463 printable_field = (int *) xmalloc ((max_range_endpoint + 1) * sizeof (int));
464 memset (printable_field, 0, (max_range_endpoint + 1) * sizeof (int));
466 /* Set the array entries corresponding to integers in the ranges of RP. */
467 for (i = 0; i < n_rp; i++)
469 unsigned int j;
470 for (j = rp[i].lo; j <= rp[i].hi; j++)
472 printable_field[j] = 1;
476 free (rp);
478 return field_found;
/* Read from stream STREAM, printing to standard output any selected bytes.
   Positions are counted from 1 and restart after every newline.  Each
   input newline is copied to the output, and a final line that lacks
   one still gets a newline on output.  */

static void
cut_bytes (FILE *stream)
{
  unsigned int byte_idx;        /* Number of chars in the line so far.  */

  byte_idx = 0;
  while (1)
    {
      int c;                    /* Each character from the file.  */

      c = getc (stream);

      if (c == '\n')
        {
          putchar ('\n');
          byte_idx = 0;
        }
      else if (c == EOF)
        {
          /* Terminate an unfinished last line.  */
          if (byte_idx > 0)
            putchar ('\n');
          break;
        }
      else
        {
          ++byte_idx;
          if (print_kth (byte_idx))
            {
              putchar (c);
            }
        }
    }
}
517 /* Read from stream STREAM, printing to standard output any selected fields. */
519 static void
520 cut_fields (FILE *stream)
522 int c;
523 unsigned int field_idx;
524 int found_any_selected_field;
525 int buffer_first_field;
526 int empty_input;
528 found_any_selected_field = 0;
529 field_idx = 1;
531 c = getc (stream);
532 empty_input = (c == EOF);
533 if (c != EOF)
534 ungetc (c, stream);
536 /* To support the semantics of the -s flag, we may have to buffer
537 all of the first field to determine whether it is `delimited.'
538 But that is unnecessary if all non-delimited lines must be printed
539 and the first field has been selected, or if non-delimited lines
540 must be suppressed and the first field has *not* been selected.
541 That is because a non-delimited line has exactly one field. */
542 buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
544 while (1)
546 if (field_idx == 1 && buffer_first_field)
548 int len;
550 len = getstr (&field_1_buffer, &field_1_bufsize, stream, delim);
551 if (len < 0)
552 break;
554 assert (len != 0);
556 /* If the first field extends to the end of line (it is not
557 delimited) and we are printing all non-delimited lines,
558 print this one. */
559 if (field_1_buffer[len - 1] != delim)
561 if (suppress_non_delimited)
563 /* Empty. */
565 else
567 fwrite (field_1_buffer, sizeof (char), len, stdout);
568 /* Make sure the output line is newline terminated. */
569 if (field_1_buffer[len - 1] != '\n')
570 putchar ('\n');
572 continue;
574 if (print_kth (1))
576 /* Print the field, but not the trailing delimiter. */
577 fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
578 found_any_selected_field = 1;
580 ++field_idx;
583 if (c != EOF)
585 if (print_kth (field_idx))
587 if (found_any_selected_field)
588 putchar (delim);
589 found_any_selected_field = 1;
591 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
593 putchar (c);
596 else
598 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
600 /* Empty. */
605 if (c == '\n')
607 c = getc (stream);
608 if (c != EOF)
610 ungetc (c, stream);
611 c = '\n';
615 if (c == delim)
616 ++field_idx;
617 else if (c == '\n' || c == EOF)
619 if (found_any_selected_field
620 || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
621 putchar ('\n');
622 if (c == EOF)
623 break;
624 field_idx = 1;
625 found_any_selected_field = 0;
630 static void
631 cut_stream (FILE *stream)
633 if (operating_mode == byte_mode)
634 cut_bytes (stream);
635 else
636 cut_fields (stream);
639 /* Process file FILE to standard output.
640 Return 0 if successful, 1 if not. */
642 static int
643 cut_file (char *file)
645 FILE *stream;
647 if (!strcmp (file, "-"))
649 have_read_stdin = 1;
650 stream = stdin;
652 else
654 stream = fopen (file, "r");
655 if (stream == NULL)
657 error (0, errno, "%s", file);
658 return 1;
662 cut_stream (stream);
664 if (ferror (stream))
666 error (0, errno, "%s", file);
667 return 1;
669 if (!strcmp (file, "-"))
670 clearerr (stream); /* Also clear EOF. */
671 else if (fclose (stream) == EOF)
673 error (0, errno, "%s", file);
674 return 1;
676 return 0;
680 main (int argc, char **argv)
682 int optc, exit_status = 0;
684 program_name = argv[0];
685 setlocale (LC_ALL, "");
686 bindtextdomain (PACKAGE, LOCALEDIR);
687 textdomain (PACKAGE);
689 operating_mode = undefined_mode;
691 /* By default, all non-delimited lines are printed. */
692 suppress_non_delimited = 0;
694 delim = '\0';
695 have_read_stdin = 0;
697 while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, NULL)) != -1)
699 switch (optc)
701 case 0:
702 break;
704 case 'b':
705 case 'c':
706 /* Build the byte list. */
707 if (operating_mode != undefined_mode)
708 FATAL_ERROR (_("only one type of list may be specified"));
709 operating_mode = byte_mode;
710 if (set_fields (optarg) == 0)
711 FATAL_ERROR (_("missing list of positions"));
712 break;
714 case 'f':
715 /* Build the field list. */
716 if (operating_mode != undefined_mode)
717 FATAL_ERROR (_("only one type of list may be specified"));
718 operating_mode = field_mode;
719 if (set_fields (optarg) == 0)
720 FATAL_ERROR (_("missing list of fields"));
721 break;
723 case 'd':
724 /* New delimiter. */
725 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
726 if (optarg[0] != '\0' && optarg[1] != '\0')
727 FATAL_ERROR (_("the delimiter must be a single character"));
728 delim = optarg[0];
729 break;
731 case 'n':
732 break;
734 case 's':
735 suppress_non_delimited = 1;
736 break;
738 default:
739 usage (2);
743 if (show_version)
745 printf ("cut (%s) %s\n", GNU_PACKAGE, VERSION);
746 exit (EXIT_SUCCESS);
749 if (show_help)
750 usage (0);
752 if (operating_mode == undefined_mode)
753 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
755 if (delim != '\0' && operating_mode != field_mode)
756 FATAL_ERROR (_("a delimiter may be specified only when operating on fields"));
758 if (suppress_non_delimited && operating_mode != field_mode)
759 FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
760 \tonly when operating on fields"));
762 if (delim == '\0')
763 delim = '\t';
765 if (optind == argc)
766 exit_status |= cut_file ("-");
767 else
768 for (; optind < argc; optind++)
769 exit_status |= cut_file (argv[optind]);
771 if (have_read_stdin && fclose (stdin) == EOF)
773 error (0, errno, "-");
774 exit_status = 1;
776 if (ferror (stdout) || fclose (stdout) == EOF)
777 error (EXIT_FAILURE, errno, _("write error"));
779 exit (exit_status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);