.
[coreutils.git] / src / cut.c
blobcd103d3ed519c0c63ea0d75e262986e1db096cd1
1 /* cut - remove parts of lines of files
2 Copyright (C) 1984 by David M. Ihnat
4 This program is a total rewrite of the Bell Laboratories Unix(Tm)
5 command of the same name, as of System V. It contains no proprietary
6 code, and therefore may be used without violation of any proprietary
7 agreements whatsoever. However, you will notice that the program is
8 copyrighted by me. This is to assure the program does *not* fall
9 into the public domain. Thus, I may specify just what I am now:
10 This program may be freely copied and distributed, provided this notice
11 remains; it may not be sold for profit without express written consent of
12 the author.
13 Please note that I recreated the behavior of the Unix(Tm) 'cut' command
14 as faithfully as possible; however, I haven't run a full set of regression
15 tests. Thus, the user of this program accepts full responsibility for any
16 effects or loss; in particular, the author is not responsible for any losses,
17 explicit or incidental, that may be incurred through use of this program.
19 I ask that any bugs (and, if possible, fixes) be reported to me when
20 possible. -David Ihnat (312) 784-4544 ignatz@homebru.chi.il.us
22 POSIX changes, bug fixes, long-named options, and cleanup
23 by David MacKenzie <djm@gnu.ai.mit.edu>.
25 Rewrite cut_fields and cut_bytes -- Jim Meyering (meyering@comco.com).
27 Options:
28 --bytes=byte-list
29 -b byte-list Print only the bytes in positions listed
30 in BYTE-LIST.
31 Tabs and backspaces are treated like any
32 other character; they take up 1 byte.
34 --characters=character-list
35 -c character-list Print only characters in positions listed
36 in CHARACTER-LIST.
37 The same as -b for now, but
38 internationalization will change that.
39 Tabs and backspaces are treated like any
40 other character; they take up 1 character.
42 --fields=field-list
43 -f field-list Print only the fields listed in FIELD-LIST.
44 Fields are separated by a TAB by default.
46 --delimiter=delim
47 -d delim For -f, fields are separated by the first
48 character in DELIM instead of TAB.
50 -n Do not split multibyte chars (no-op for now).
52 --only-delimited
53 -s For -f, do not print lines that do not contain
54 the field separator character.
56 The BYTE-LIST, CHARACTER-LIST, and FIELD-LIST are one or more numbers
57 or ranges separated by commas. The first byte, character, and field
58 are numbered 1.
60 A FILE of `-' means standard input. */
#include <config.h>

/* Get isblank from GNU libc.  */
#define _GNU_SOURCE

#include <stdio.h>

#define NDEBUG
#include <assert.h>

#include <getopt.h>
#include <limits.h>
#include <sys/types.h>
#include "system.h"
#include "version.h"
#include "error.h"
/* Report the usage error message S through error (), then call
   usage (2), which prints the "Try --help" hint and exits with status 2.
   S is handed to error () as the argument of a "%s" format rather than
   as the format itself, so a '%' inside a message is printed literally
   instead of being interpreted as a conversion specification.  */
#define FATAL_ERROR(s)                                                  \
  do                                                                    \
    {                                                                   \
      error (0, 0, "%s", (s));                                          \
      usage (2);                                                        \
    }                                                                   \
  while (0)
/* Append the pair (LOW, HIGH) to the array RP of range pairs, doubling
   the allocation with xrealloc when the array is full.  The macro relies
   on two variables being in scope where it is used: N_RP, the number of
   pairs stored so far (incremented here), and N_RP_ALLOCATED, the number
   of pairs RP currently has room for (updated whenever RP is grown).
   RP must be a modifiable lvalue because it is reassigned on growth; it
   is parenthesized at every use so that an expression such as `*pp'
   may be passed.  */
#define ADD_RANGE_PAIR(rp, low, high)                                   \
  do                                                                    \
    {                                                                   \
      if (n_rp >= n_rp_allocated)                                       \
        {                                                               \
          n_rp_allocated *= 2;                                          \
          (rp) = (struct range_pair *) xrealloc ((rp),                  \
                                 n_rp_allocated * sizeof (*(rp)));      \
        }                                                               \
      (rp)[n_rp].lo = (low);                                            \
      (rp)[n_rp].hi = (high);                                           \
      ++n_rp;                                                           \
    }                                                                   \
  while (0)
/* One range from a byte or field list: positions LO through HI,
   both endpoints included.  */
struct range_pair
{
  int lo;       /* First position of the range.  */
  int hi;       /* Last position of the range.  */
};
/* Allocation helpers; only declared here, their definitions are not
   part of this file.  */
char *xmalloc ();
char *xrealloc ();

/* Holding area for the whole first field of a line.  It is needed to
   implement -s (or its absence) when the field list includes (does not
   include) field 1: in those cases nothing from the first field may be
   written until it is known whether a delimiter or a newline follows
   it.  In every other case cut_fields works without this buffer.  */
static char *field_1_buffer;

/* Number of bytes currently allocated for FIELD_1_BUFFER.  */
static int field_1_bufsize;

/* Largest index that ends a closed or single-element range.  The start
   of a range that runs to end of line is not counted: for either
   `2-5,9-' or `2-3,5,9-' this is 5.  */
static int max_range_endpoint;

/* When nonzero, the index at which a range extending to end of line
   begins.  */
static int eol_range_start;

/* Selection table: bytes in byte mode, DELIM-separated fields in field
   mode.  Numbering starts at 1, so element zero is unused.  Index K is
   selected when
     (K <= MAX_RANGE_ENDPOINT and PRINTABLE_FIELD[K])
     || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START).  */
static int *printable_field;
/* What kind of list was given on the command line.  */
enum operating_mode
{
  /* No list option has been processed yet.  */
  undefined_mode,

  /* Select by byte position within each line.  */
  byte_mode,

  /* Select delimiter-separated fields.  */
  field_mode
};
156 /* The name this program was run with. */
157 char *program_name;
159 static enum operating_mode operating_mode;
161 /* If non-zero do not output lines containing no delimeter characters.
162 Otherwise, all such lines are printed. This option is valid only
163 with field mode. */
164 static int suppress_non_delimited;
166 /* The delimeter character for field mode. */
167 static int delim;
169 /* Nonzero if we have ever read standard input. */
170 static int have_read_stdin;
172 /* If non-zero, display usage information and exit. */
173 static int show_help;
175 /* If non-zero, print the version on standard output then exit. */
176 static int show_version;
178 static struct option const longopts[] =
180 {"bytes", required_argument, 0, 'b'},
181 {"characters", required_argument, 0, 'c'},
182 {"fields", required_argument, 0, 'f'},
183 {"delimiter", required_argument, 0, 'd'},
184 {"only-delimited", no_argument, 0, 's'},
185 {"help", no_argument, &show_help, 1},
186 {"version", no_argument, &show_version, 1},
187 {0, 0, 0, 0}
/* Print usage information and exit with STATUS.  A nonzero STATUS
   writes a one-line "Try --help" hint to stderr; a zero STATUS prints
   the full help text to stdout.  Either way the function ends by
   calling exit (STATUS), so it does not return to its caller.  */
190 static void
191 usage (status)
192 int status;
194 if (status != 0)
195 fprintf (stderr, "Try `%s --help' for more information.\n",
196 program_name);
197 else
199 printf ("\
200 Usage: %s [OPTION]... [FILE]...\n\
202 program_name);
203 printf ("\
204 Print selected parts of lines from each FILE to standard output.\n\
206 -b, --bytes=LIST output only these bytes\n\
207 -c, --characters=LIST output only these characters\n\
208 -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
209 -f, --fields=LIST output only these fields\n\
210 -n (ignored)\n\
211 -s, --only-delimited do not print lines not containing delimiters\n\
212 --help display this help and exit\n\
213 --version output version information and exit\n\
215 Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
216 range, or many ranges separated by commas. Each range is one of:\n\
218 N N'th byte, character or field, counted from 1\n\
219 N- from N'th byte, character or field, to end of line\n\
220 N-M from N'th to M'th (included) byte, character or field\n\
221 -M from first to M'th (included) byte, character or field\n\
223 With no FILE, or when FILE is -, read standard input.\n\
226 exit (status);
229 /* The following function was copied from getline.c, but with these changes:
230 - Read up to and including a newline or TERMINATOR, whichever comes first.
231 The original does not treat newline specially.
232 - Remove unused argument, OFFSET.
233 - Use xmalloc and xrealloc instead of malloc and realloc.
234 - Declare this function static. */
/* Always add at least this many bytes when extending the buffer.  */
#define MIN_CHUNK 64

/* Read bytes from STREAM into *LINEPTR up to and including the first
   newline or TERMINATOR byte, whichever comes first, and NUL-terminate
   the result.

   *LINEPTR is either NULL or a pointer obtained from xmalloc that
   points to *N bytes of space; it is allocated or enlarged with
   xmalloc/xrealloc as needed and *N is updated to match.

   Return the number of bytes stored (not counting the terminating
   NUL).  Return -1 if an argument is NULL, or if end of file or a read
   error occurs before any byte has been stored.  If EOF or an error
   occurs after some bytes have been stored, the partial line is
   returned.  */

static int
getstr (lineptr, n, stream, terminator)
     char **lineptr;
     int *n;
     FILE *stream;
     char terminator;
{
  /* getc yields each byte as an unsigned char value, so TERMINATOR is
     converted the same way before comparing.  A plain `c == terminator'
     can never match a byte >= 0x80 where char is signed.  */
  const int term_byte = (unsigned char) terminator;
  int nchars_avail;             /* Allocated but unused chars in *LINEPTR.  */
  char *read_pos;               /* Where we're reading into *LINEPTR.  */

  if (!lineptr || !n || !stream)
    return -1;

  if (!*lineptr)
    {
      *n = MIN_CHUNK;
      *lineptr = xmalloc (*n);
      if (!*lineptr)
        return -1;
    }

  nchars_avail = *n;
  read_pos = *lineptr;

  for (;;)
    {
      register int c = getc (stream);

      assert (*n - nchars_avail == read_pos - *lineptr);

      /* Keep room for two bytes: the one about to be stored and the
         terminating NUL written after the loop.  Growing only when no
         room is left would let a line that exactly fills the buffer
         push the NUL one byte past the end of the allocation.  */
      if (nchars_avail < 2)
        {
          if (*n > MIN_CHUNK)
            *n *= 2;
          else
            *n += MIN_CHUNK;

          /* Compute the new free space from the old base pointer, then
             re-derive READ_POS because xrealloc may move the buffer.  */
          nchars_avail = *n + *lineptr - read_pos;
          *lineptr = xrealloc (*lineptr, *n);
          if (!*lineptr)
            return -1;
          read_pos = *n - nchars_avail + *lineptr;
          assert (*n - nchars_avail == read_pos - *lineptr);
        }

      if (feof (stream) || ferror (stream))
        {
          /* Return partial line, if any.  */
          if (read_pos == *lineptr)
            return -1;
          else
            break;
        }

      *read_pos++ = c;
      nchars_avail--;

      if (c == term_byte || c == '\n')
        /* Return the line.  */
        break;
    }

  /* Done - NUL terminate and return the number of chars read.  */
  *read_pos = '\0';

  return read_pos - *lineptr;
}
316 static int
317 print_kth (k)
318 int k;
320 return ((eol_range_start > 0 && eol_range_start <= k)
321 || (k <= max_range_endpoint && printable_field[k]));
324 /* Given the list of field or byte range specifications FIELDSTR, set
325 MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
326 array. If there is a right-open-ended range, set EOL_RANGE_START
327 to its starting index. FIELDSTR should be composed of one or more
328 numbers or ranges of numbers, separated by blanks or commas.
329 Incomplete ranges may be given: `-m' means `1-m'; `n-' means `n'
330 through end of line. Return non-zero if FIELDSTR contains at least
331 one field specification, zero otherwise. */
333 /* FIXME-someday: What if the user wants to cut out the 1,000,000-th field
334 of some huge input file? This function shouldn't have to alloate a table
335 of a million ints just so we can test every field < 10^6 with an array
336 dereference. Instead, consider using a dynamic hash table. It would be
337 simpler and nearly as good a solution to use a 32K x 4-byte table with
338 one bit per field index instead of a whole `int' per index. */
340 static int
341 set_fields (fieldstr)
342 const char *fieldstr;
344 int initial = 1; /* Value of first number in a range. */
345 int dash_found = 0; /* Nonzero if a '-' is found in this field. */
346 int value = 0; /* If nonzero, a number being accumulated. */
347 int field_found = 0; /* Non-zero if at least one field spec
348 has been processed. */
350 struct range_pair *rp;
351 unsigned int n_rp;
352 unsigned int n_rp_allocated;
353 unsigned int i;
355 n_rp = 0;
356 n_rp_allocated = 16;
357 rp = (struct range_pair *) xmalloc (n_rp_allocated * sizeof (*rp));
359 /* Collect and store in RP the range end points.
360 It also sets EOL_RANGE_START if appropriate. */
362 for (;;)
364 if (*fieldstr == '-')
366 /* Starting a range. */
367 if (dash_found)
368 FATAL_ERROR ("invalid byte or field list");
369 dash_found++;
370 fieldstr++;
372 if (value)
374 initial = value;
375 value = 0;
377 else
378 initial = 1;
380 else if (*fieldstr == ',' || ISBLANK (*fieldstr) || *fieldstr == '\0')
382 /* Ending the string, or this field/byte sublist. */
383 if (dash_found)
385 dash_found = 0;
387 /* A range. Possibilites: -n, m-n, n-.
388 In any case, `initial' contains the start of the range. */
389 if (value == 0)
391 /* `n-'. From `initial' to end of line. */
392 eol_range_start = initial;
393 field_found = 1;
395 else
397 /* `m-n' or `-n' (1-n). */
398 if (value < initial)
399 FATAL_ERROR ("invalid byte or field list");
401 /* Is there already a range going to end of line? */
402 if (eol_range_start != 0)
404 /* Yes. Is the new sequence already contained
405 in the old one? If so, no processing is
406 necessary. */
407 if (initial < eol_range_start)
409 /* No, the new sequence starts before the
410 old. Does the old range going to end of line
411 extend into the new range? */
412 if (value >= eol_range_start - 1)
414 /* Yes. Simply move the end of line marker. */
415 eol_range_start = initial;
417 else
419 /* No. A simple range, before and disjoint from
420 the range going to end of line. Fill it. */
421 ADD_RANGE_PAIR (rp, initial, value);
424 /* In any case, some fields were selected. */
425 field_found = 1;
428 else
430 /* There is no range going to end of line. */
431 ADD_RANGE_PAIR (rp, initial, value);
432 field_found = 1;
434 value = 0;
437 else if (value != 0)
439 /* A simple field number, not a range. */
440 ADD_RANGE_PAIR (rp, value, value);
441 value = 0;
442 field_found = 1;
445 if (*fieldstr == '\0')
447 break;
450 fieldstr++;
452 else if (ISDIGIT (*fieldstr))
454 /* FIXME: detect overflow? */
455 value = 10 * value + *fieldstr - '0';
456 fieldstr++;
458 else
459 FATAL_ERROR ("invalid byte or field list");
462 max_range_endpoint = 0;
463 for (i = 0; i < n_rp; i++)
465 if (rp[i].hi > max_range_endpoint)
466 max_range_endpoint = rp[i].hi;
469 /* Allocate an array large enough so that it may be indexed by
470 the field numbers corresponding to all finite ranges
471 (i.e. `2-6' or `-4', but not `5-') in FIELDSTR. */
473 printable_field = (int *) xmalloc ((max_range_endpoint + 1) * sizeof (int));
474 for (i = 1; i <= max_range_endpoint; i++)
475 printable_field[i] = 0;
477 /* Set the array entries corresponding to integers in the ranges of RP. */
478 for (i = 0; i < n_rp; i++)
480 int j;
481 for (j = rp[i].lo; j <= rp[i].hi; j++)
483 printable_field[j] = 1;
487 free (rp);
489 return field_found;
/* Copy the selected bytes of every line of STREAM to standard output.
   Each input line produces one output line; if the input ends without
   a final newline, one is supplied for the last, partial line.  */

static void
cut_bytes (stream)
     FILE *stream;
{
  int pos = 0;          /* Bytes seen so far on the current line.  */
  int ch;               /* Most recent byte read from STREAM.  */

  while ((ch = getc (stream)) != EOF)
    {
      if (ch == '\n')
        {
          putchar ('\n');
          pos = 0;
          continue;
        }

      pos++;
      if (print_kth (pos))
        putchar (ch);
    }

  /* Input stopped in the middle of a line: finish the output line.  */
  if (pos > 0)
    putchar ('\n');
}
529 /* Read from stream STREAM, printing to standard output any selected fields. */
531 static void
532 cut_fields (stream)
533 FILE *stream;
535 int c;
536 int field_idx;
537 int found_any_selected_field;
538 int buffer_first_field;
540 found_any_selected_field = 0;
541 field_idx = 1;
543 /* To support the semantics of the -s flag, we may have to buffer
544 all of the first field to determine whether it is `delimited.'
545 But that is unnecessary if all non-delimited lines must be printed
546 and the first field has been selected, or if non-delimited lines
547 must be suppressed and the first field has *not* been selected.
548 That is because a non-delimited line has exactly one field. */
549 buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
551 while (1)
553 if (field_idx == 1 && buffer_first_field)
555 int len;
557 len = getstr (&field_1_buffer, &field_1_bufsize, stream, delim);
558 if (len < 0)
559 break;
561 assert (len != 0);
563 /* If the first field extends to the end of line (it is not
564 delimited) and we are printing all non-delimited lines,
565 print this one. */
566 if (field_1_buffer[len - 1] != delim)
568 if (suppress_non_delimited)
570 /* Empty. */
572 else
574 fwrite (field_1_buffer, sizeof (char), len, stdout);
575 /* Make sure the output line is newline terminated. */
576 if (field_1_buffer[len - 1] != '\n')
577 putchar ('\n');
579 continue;
581 if (print_kth (1))
583 /* Print the field, but not the trailing delimiter. */
584 fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
585 found_any_selected_field = 1;
587 ++field_idx;
590 if (print_kth (field_idx))
592 if (found_any_selected_field)
593 putchar (delim);
594 found_any_selected_field = 1;
596 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
598 putchar (c);
601 else
603 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
605 /* Empty. */
609 if (c == '\n')
611 c = getc (stream);
612 if (c != EOF)
614 ungetc (c, stream);
615 c = '\n';
619 if (c == delim)
620 ++field_idx;
621 else if (c == '\n' || c == EOF)
623 if (found_any_selected_field
624 || !(suppress_non_delimited && field_idx == 1))
625 putchar ('\n');
626 if (c == EOF)
627 break;
628 field_idx = 1;
629 found_any_selected_field = 0;
634 static void
635 cut_stream (stream)
636 FILE *stream;
638 if (operating_mode == byte_mode)
639 cut_bytes (stream);
640 else
641 cut_fields (stream);
644 /* Process file FILE to standard output.
645 Return 0 if successful, 1 if not. */
647 static int
648 cut_file (file)
649 char *file;
651 FILE *stream;
653 if (!strcmp (file, "-"))
655 have_read_stdin = 1;
656 stream = stdin;
658 else
660 stream = fopen (file, "r");
661 if (stream == NULL)
663 error (0, errno, "%s", file);
664 return 1;
668 cut_stream (stream);
670 if (ferror (stream))
672 error (0, errno, "%s", file);
673 return 1;
675 if (!strcmp (file, "-"))
676 clearerr (stream); /* Also clear EOF. */
677 else if (fclose (stream) == EOF)
679 error (0, errno, "%s", file);
680 return 1;
682 return 0;
685 void
686 main (argc, argv)
687 int argc;
688 char **argv;
690 int optc, exit_status = 0;
692 program_name = argv[0];
694 operating_mode = undefined_mode;
696 /* By default, all non-delimited lines are printed. */
697 suppress_non_delimited = 0;
699 delim = '\0';
700 have_read_stdin = 0;
702 while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, (int *) 0))
703 != EOF)
705 switch (optc)
707 case 0:
708 break;
710 case 'b':
711 case 'c':
712 /* Build the byte list. */
713 if (operating_mode != undefined_mode)
714 FATAL_ERROR ("only one type of list may be specified");
715 operating_mode = byte_mode;
716 if (set_fields (optarg) == 0)
717 FATAL_ERROR ("missing list of positions");
718 break;
720 case 'f':
721 /* Build the field list. */
722 if (operating_mode != undefined_mode)
723 FATAL_ERROR ("only one type of list may be specified");
724 operating_mode = field_mode;
725 if (set_fields (optarg) == 0)
726 FATAL_ERROR ("missing list of fields");
727 break;
729 case 'd':
730 /* New delimiter. */
731 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
732 if (optarg[0] != '\0' && optarg[1] != '\0')
733 FATAL_ERROR ("the delimiter must be a single character");
734 delim = optarg[0];
735 break;
737 case 'n':
738 break;
740 case 's':
741 suppress_non_delimited = 1;
742 break;
744 default:
745 usage (2);
749 if (show_version)
751 printf ("cut - %s\n", version_string);
752 exit (0);
755 if (show_help)
756 usage (0);
758 if (operating_mode == undefined_mode)
759 FATAL_ERROR ("you must specify a list of bytes, characters, or fields");
761 if (delim != '\0' && operating_mode != field_mode)
762 FATAL_ERROR ("a delimiter may be specified only when operating on fields");
764 if (suppress_non_delimited && operating_mode != field_mode)
765 FATAL_ERROR ("suppressing non-delimited lines makes sense\n\
766 \tonly when operating on fields");
768 if (delim == '\0')
769 delim = '\t';
771 if (optind == argc)
772 exit_status |= cut_file ("-");
773 else
774 for (; optind < argc; optind++)
775 exit_status |= cut_file (argv[optind]);
777 if (have_read_stdin && fclose (stdin) == EOF)
779 error (0, errno, "-");
780 exit_status = 1;
782 if (ferror (stdout) || fclose (stdout) == EOF)
783 error (1, errno, "write error");
785 exit (exit_status);