.
[coreutils.git] / src / cut.c
blobd50ec71c9a4c6852290d8df1657b17a475624bcc
1 /* cut - remove parts of lines of files
2 Copyright (C) 1984 by David M. Ihnat
4 This program is a total rewrite of the Bell Laboratories Unix(Tm)
5 command of the same name, as of System V. It contains no proprietary
6 code, and therefore may be used without violation of any proprietary
7 agreements whatsoever. However, you will notice that the program is
8 copyrighted by me. This is to assure the program does *not* fall
9 into the public domain. Thus, I may specify just what I am now:
10 This program may be freely copied and distributed, provided this notice
11 remains; it may not be sold for profit without express written consent of
12 the author.
13 Please note that I recreated the behavior of the Unix(Tm) 'cut' command
14 as faithfully as possible; however, I haven't run a full set of regression
15 tests. Thus, the user of this program accepts full responsibility for any
16 effects or loss; in particular, the author is not responsible for any losses,
17 explicit or incidental, that may be incurred through use of this program.
19 I ask that any bugs (and, if possible, fixes) be reported to me when
20 possible. -David Ihnat (312) 784-4544 ignatz@homebru.chi.il.us
22 POSIX changes, bug fixes, long-named options, and cleanup
23 by David MacKenzie <djm@gnu.ai.mit.edu>.
25 Rewrite cut_fields and cut_bytes -- Jim Meyering (meyering@comco.com).
27 Options:
28 --bytes=byte-list
29 -b byte-list Print only the bytes in positions listed
30 in BYTE-LIST.
31 Tabs and backspaces are treated like any
32 other character; they take up 1 byte.
34 --characters=character-list
35 -c character-list Print only characters in positions listed
36 in CHARACTER-LIST.
37 The same as -b for now, but
38 internationalization will change that.
39 Tabs and backspaces are treated like any
40 other character; they take up 1 character.
42 --fields=field-list
43 -f field-list Print only the fields listed in FIELD-LIST.
44 Fields are separated by a TAB by default.
46 --delimiter=delim
47 -d delim For -f, fields are separated by the first
48 character in DELIM instead of TAB.
50 -n Do not split multibyte chars (no-op for now).
52 --only-delimited
53 -s For -f, do not print lines that do not contain
54 the field separator character.
56 The BYTE-LIST, CHARACTER-LIST, and FIELD-LIST are one or more numbers
57 or ranges separated by commas. The first byte, character, and field
58 are numbered 1.
60 A FILE of `-' means standard input. */
62 #include <config.h>
64 /* Get isblank from GNU libc. */
65 #define _GNU_SOURCE
67 #include <stdio.h>
69 #define NDEBUG
70 #include <assert.h>
72 #include <getopt.h>
73 #include <sys/types.h>
74 #include "system.h"
75 #include "version.h"
76 #include "error.h"
/* Report the message S on standard error through error (), then call
   usage (2), which prints the short `--help' hint and exits with
   status 2.  S is handed to error () as the argument of a "%s" format
   rather than as the format itself, so a `%' inside a (possibly
   translated) message is printed literally instead of being interpreted
   as a conversion.  */
#define FATAL_ERROR(s) \
  do \
    { \
      error (0, 0, "%s", (s)); \
      usage (2); \
    } \
  while (0)
/* Append the pair LOW, HIGH to the array RP of range pairs, doubling the
   allocation with xrealloc when the array is full.  The macro reads and
   updates two variables that must be in scope where it is used:
   N_RP (the number of pairs stored so far) and N_RP_ALLOCATED (the
   number of pairs RP currently has room for).  RP is evaluated more than
   once, so it must be an lvalue without side effects; every use is
   parenthesized so that an expression such as `*pp' works as RP.  */
#define ADD_RANGE_PAIR(rp, low, high) \
  do \
    { \
      if (n_rp >= n_rp_allocated) \
        { \
          n_rp_allocated *= 2; \
          (rp) = (struct range_pair *) xrealloc ((rp), \
                                n_rp_allocated * sizeof (*(rp))); \
        } \
      (rp)[n_rp].lo = (low); \
      (rp)[n_rp].hi = (high); \
      ++n_rp; \
    } \
  while (0)
/* One closed range of byte or field positions, as collected by
   set_fields: every position from LO through HI, both included.  */
struct range_pair
{
  unsigned int lo;      /* First position of the range.  */
  unsigned int hi;      /* Last position of the range.  */
};
/* Allocation routines defined outside this file.  They are declared
   without prototypes, so argument types are not checked at call sites.  */
char *xmalloc ();
char *xrealloc ();

/* Scratch space for the first field of a line.  It is needed to honor
   the -s option when field 1 is selected, and to honor its absence when
   field 1 is not selected: in both situations the whole first field has
   to be read before any of it may be written, because only then is it
   known whether a delimiter or a newline follows it.  In every other
   situation cut_fields works without this buffer.  */
static char *field_1_buffer;

/* Size, in bytes, of the allocation FIELD_1_BUFFER points to.  */
static int field_1_bufsize;

/* Largest byte or field index that ends a closed range or stands alone
   in the list.  The start of a right-open range does not count: for the
   specs `2-5,9-' and `2-3,5,9-' alike, this is 5.  */
static unsigned int max_range_endpoint;

/* Start index of the range that runs to end of line, or zero when the
   list contains no such range.  */
static unsigned int eol_range_start;

/* Selection table: which bytes (byte mode) or which DELIM-separated
   fields (field mode) to output.  Positions are counted from 1, so
   element zero is unused.  Position K is selected if
      (K <= MAX_RANGE_ENDPOINT and PRINTABLE_FIELD[K])
   || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START).  */
static int *printable_field;
/* The kind of list the program has been asked to apply.  */
enum operating_mode
{
  /* No list option has been processed yet.  */
  undefined_mode,

  /* Output characters that are in the given bytes.  */
  byte_mode,

  /* Output the given delimiter-separated fields.  */
  field_mode
};
156 /* The name this program was run with. */
157 char *program_name;
159 static enum operating_mode operating_mode;
161 /* If nonzero do not output lines containing no delimeter characters.
162 Otherwise, all such lines are printed. This option is valid only
163 with field mode. */
164 static int suppress_non_delimited;
166 /* The delimeter character for field mode. */
167 static int delim;
169 /* Nonzero if we have ever read standard input. */
170 static int have_read_stdin;
172 /* If nonzero, display usage information and exit. */
173 static int show_help;
175 /* If nonzero, print the version on standard output then exit. */
176 static int show_version;
178 static struct option const longopts[] =
180 {"bytes", required_argument, 0, 'b'},
181 {"characters", required_argument, 0, 'c'},
182 {"fields", required_argument, 0, 'f'},
183 {"delimiter", required_argument, 0, 'd'},
184 {"only-delimited", no_argument, 0, 's'},
185 {"help", no_argument, &show_help, 1},
186 {"version", no_argument, &show_version, 1},
187 {0, 0, 0, 0}
/* Print a usage message and exit with STATUS.  When STATUS is nonzero,
   only a one-line hint pointing at `--help' is written, to standard
   error; when it is zero, the full option and LIST summary is written to
   standard output.  This function does not return.  */
190 static void
191 usage (int status)
193 if (status != 0)
194 fprintf (stderr, _("Try `%s --help' for more information.\n"),
195 program_name);
196 else
198 printf (_("\
199 Usage: %s [OPTION]... [FILE]...\n\
201 program_name);
202 printf (_("\
203 Print selected parts of lines from each FILE to standard output.\n\
205 -b, --bytes=LIST output only these bytes\n\
206 -c, --characters=LIST output only these characters\n\
207 -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
208 -f, --fields=LIST output only these fields\n\
209 -n (ignored)\n\
210 -s, --only-delimited do not print lines not containing delimiters\n\
211 --help display this help and exit\n\
212 --version output version information and exit\n\
214 Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
215 range, or many ranges separated by commas. Each range is one of:\n\
217 N N'th byte, character or field, counted from 1\n\
218 N- from N'th byte, character or field, to end of line\n\
219 N-M from N'th to M'th (included) byte, character or field\n\
220 -M from first to M'th (included) byte, character or field\n\
222 With no FILE, or when FILE is -, read standard input.\n\
223 "));
225 exit (status);
/* The following function was copied from getline.c, but with these changes:
   - Read up to and including a newline or TERMINATOR, whichever comes first.
     The original does not treat newline specially.
   - Remove unused argument, OFFSET.
   - Use xmalloc and xrealloc instead of malloc and realloc.
   - Declare this function static.  */

/* Always add at least this many bytes when extending the buffer.  */
#define MIN_CHUNK 64

/* Read up to (and including) a newline or TERMINATOR from STREAM into
   *LINEPTR (and null-terminate it).  *LINEPTR is a pointer returned from
   xmalloc (or NULL), pointing to *N characters of space.  It is
   xrealloc'd as necessary, and *N is updated to the new size.  Return
   the number of characters read (not including the null terminator), or
   -1 on error or EOF when nothing was read.  A partial last line (input
   ending without newline or TERMINATOR) is returned like any other.

   TERMINATOR is compared by its unsigned char value, which is the form
   in which getc delivers bytes, so a terminator whose plain `char' value
   is negative is still recognized.  */

static int
getstr (char **lineptr, int *n, FILE *stream, char terminator)
{
  int nchars_avail;             /* Allocated but unused chars in *LINEPTR.  */
  char *read_pos;               /* Where we're reading into *LINEPTR.  */
  const int term_byte = (unsigned char) terminator;

  if (!lineptr || !n || !stream)
    return -1;

  if (!*lineptr)
    {
      *n = MIN_CHUNK;
      *lineptr = xmalloc (*n);
      if (!*lineptr)
        return -1;
    }

  nchars_avail = *n;
  read_pos = *lineptr;

  for (;;)
    {
      register int c = getc (stream);

      assert (*n - nchars_avail == read_pos - *lineptr);

      /* Keep room for two bytes: the one about to be stored and the NUL
         that terminates the buffer.  Growing only when no byte is free
         would let a terminator stored in the very last byte push the
         NUL one past the end of the allocation.  */
      if (nchars_avail < 2)
        {
          if (*n > MIN_CHUNK)
            *n *= 2;
          else
            *n += MIN_CHUNK;

          /* Recompute the free space from the number of bytes used so
             far, before xrealloc may move the buffer.  */
          nchars_avail = *n + *lineptr - read_pos;
          *lineptr = xrealloc (*lineptr, *n);
          if (!*lineptr)
            return -1;
          read_pos = *n - nchars_avail + *lineptr;
          assert (*n - nchars_avail == read_pos - *lineptr);
        }

      if (feof (stream) || ferror (stream))
        {
          /* Return partial line, if any.  */
          if (read_pos == *lineptr)
            return -1;
          else
            break;
        }

      *read_pos++ = c;
      nchars_avail--;

      if (c == term_byte || c == '\n')
        /* Return the line.  */
        break;
    }

  /* Done - NUL terminate and return the number of chars read.  */
  *read_pos = '\0';

  return read_pos - *lineptr;
}
311 static int
312 print_kth (unsigned int k)
314 return ((0 < eol_range_start && eol_range_start <= k)
315 || (k <= max_range_endpoint && printable_field[k]));
318 /* Given the list of field or byte range specifications FIELDSTR, set
319 MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
320 array. If there is a right-open-ended range, set EOL_RANGE_START
321 to its starting index. FIELDSTR should be composed of one or more
322 numbers or ranges of numbers, separated by blanks or commas.
323 Incomplete ranges may be given: `-m' means `1-m'; `n-' means `n'
324 through end of line. Return nonzero if FIELDSTR contains at least
325 one field specification, zero otherwise. */
327 /* FIXME-someday: What if the user wants to cut out the 1,000,000-th field
328 of some huge input file? This function shouldn't have to alloate a table
329 of a million ints just so we can test every field < 10^6 with an array
330 dereference. Instead, consider using a dynamic hash table. It would be
331 simpler and nearly as good a solution to use a 32K x 4-byte table with
332 one bit per field index instead of a whole `int' per index. */
334 static int
335 set_fields (const char *fieldstr)
337 unsigned int initial = 1; /* Value of first number in a range. */
338 unsigned int value = 0; /* If nonzero, a number being accumulated. */
339 int dash_found = 0; /* Nonzero if a '-' is found in this field. */
340 int field_found = 0; /* Non-zero if at least one field spec
341 has been processed. */
343 struct range_pair *rp;
344 unsigned int n_rp;
345 unsigned int n_rp_allocated;
346 unsigned int i;
348 n_rp = 0;
349 n_rp_allocated = 16;
350 rp = (struct range_pair *) xmalloc (n_rp_allocated * sizeof (*rp));
352 /* Collect and store in RP the range end points.
353 It also sets EOL_RANGE_START if appropriate. */
355 for (;;)
357 if (*fieldstr == '-')
359 /* Starting a range. */
360 if (dash_found)
361 FATAL_ERROR (_("invalid byte or field list"));
362 dash_found++;
363 fieldstr++;
365 if (value)
367 initial = value;
368 value = 0;
370 else
371 initial = 1;
373 else if (*fieldstr == ',' || ISBLANK (*fieldstr) || *fieldstr == '\0')
375 /* Ending the string, or this field/byte sublist. */
376 if (dash_found)
378 dash_found = 0;
380 /* A range. Possibilites: -n, m-n, n-.
381 In any case, `initial' contains the start of the range. */
382 if (value == 0)
384 /* `n-'. From `initial' to end of line. */
385 eol_range_start = initial;
386 field_found = 1;
388 else
390 /* `m-n' or `-n' (1-n). */
391 if (value < initial)
392 FATAL_ERROR (_("invalid byte or field list"));
394 /* Is there already a range going to end of line? */
395 if (eol_range_start != 0)
397 /* Yes. Is the new sequence already contained
398 in the old one? If so, no processing is
399 necessary. */
400 if (initial < eol_range_start)
402 /* No, the new sequence starts before the
403 old. Does the old range going to end of line
404 extend into the new range? */
405 if (value + 1 >= eol_range_start)
407 /* Yes. Simply move the end of line marker. */
408 eol_range_start = initial;
410 else
412 /* No. A simple range, before and disjoint from
413 the range going to end of line. Fill it. */
414 ADD_RANGE_PAIR (rp, initial, value);
417 /* In any case, some fields were selected. */
418 field_found = 1;
421 else
423 /* There is no range going to end of line. */
424 ADD_RANGE_PAIR (rp, initial, value);
425 field_found = 1;
427 value = 0;
430 else if (value != 0)
432 /* A simple field number, not a range. */
433 ADD_RANGE_PAIR (rp, value, value);
434 value = 0;
435 field_found = 1;
438 if (*fieldstr == '\0')
440 break;
443 fieldstr++;
445 else if (ISDIGIT (*fieldstr))
447 /* FIXME: detect overflow? */
448 value = 10 * value + *fieldstr - '0';
449 fieldstr++;
451 else
452 FATAL_ERROR (_("invalid byte or field list"));
455 max_range_endpoint = 0;
456 for (i = 0; i < n_rp; i++)
458 if (rp[i].hi > max_range_endpoint)
459 max_range_endpoint = rp[i].hi;
462 /* Allocate an array large enough so that it may be indexed by
463 the field numbers corresponding to all finite ranges
464 (i.e. `2-6' or `-4', but not `5-') in FIELDSTR. */
466 printable_field = (int *) xmalloc ((max_range_endpoint + 1) * sizeof (int));
467 memset (printable_field, 0, (max_range_endpoint + 1) * sizeof (int));
469 /* Set the array entries corresponding to integers in the ranges of RP. */
470 for (i = 0; i < n_rp; i++)
472 unsigned int j;
473 for (j = rp[i].lo; j <= rp[i].hi; j++)
475 printable_field[j] = 1;
479 free (rp);
481 return field_found;
/* Copy to standard output the selected bytes of every line read from
   STREAM.  Each input newline is passed through and restarts the byte
   count; a final line that lacks its newline gets one on output.  */

static void
cut_bytes (FILE *stream)
{
  unsigned int pos = 0;         /* Bytes seen so far on the current line.  */
  int ch;                       /* Current input character.  */

  while ((ch = getc (stream)) != EOF)
    {
      if (ch == '\n')
        {
          putchar ('\n');
          pos = 0;
        }
      else if (print_kth (++pos))
        putchar (ch);
    }

  /* Input ended in the middle of a line: terminate the output line.  */
  if (pos > 0)
    putchar ('\n');
}
520 /* Read from stream STREAM, printing to standard output any selected fields. */
522 static void
523 cut_fields (FILE *stream)
525 int c;
526 unsigned int field_idx;
527 int found_any_selected_field;
528 int buffer_first_field;
530 found_any_selected_field = 0;
531 field_idx = 1;
533 /* To support the semantics of the -s flag, we may have to buffer
534 all of the first field to determine whether it is `delimited.'
535 But that is unnecessary if all non-delimited lines must be printed
536 and the first field has been selected, or if non-delimited lines
537 must be suppressed and the first field has *not* been selected.
538 That is because a non-delimited line has exactly one field. */
539 buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
541 while (1)
543 if (field_idx == 1 && buffer_first_field)
545 int len;
547 len = getstr (&field_1_buffer, &field_1_bufsize, stream, delim);
548 if (len < 0)
549 break;
551 assert (len != 0);
553 /* If the first field extends to the end of line (it is not
554 delimited) and we are printing all non-delimited lines,
555 print this one. */
556 if (field_1_buffer[len - 1] != delim)
558 if (suppress_non_delimited)
560 /* Empty. */
562 else
564 fwrite (field_1_buffer, sizeof (char), len, stdout);
565 /* Make sure the output line is newline terminated. */
566 if (field_1_buffer[len - 1] != '\n')
567 putchar ('\n');
569 continue;
571 if (print_kth (1))
573 /* Print the field, but not the trailing delimiter. */
574 fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
575 found_any_selected_field = 1;
577 ++field_idx;
580 if (print_kth (field_idx))
582 if (found_any_selected_field)
583 putchar (delim);
584 found_any_selected_field = 1;
586 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
588 putchar (c);
591 else
593 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
595 /* Empty. */
599 if (c == '\n')
601 c = getc (stream);
602 if (c != EOF)
604 ungetc (c, stream);
605 c = '\n';
609 if (c == delim)
610 ++field_idx;
611 else if (c == '\n' || c == EOF)
613 if (found_any_selected_field
614 || !(suppress_non_delimited && field_idx == 1))
615 putchar ('\n');
616 if (c == EOF)
617 break;
618 field_idx = 1;
619 found_any_selected_field = 0;
624 static void
625 cut_stream (FILE *stream)
627 if (operating_mode == byte_mode)
628 cut_bytes (stream);
629 else
630 cut_fields (stream);
633 /* Process file FILE to standard output.
634 Return 0 if successful, 1 if not. */
636 static int
637 cut_file (char *file)
639 FILE *stream;
641 if (!strcmp (file, "-"))
643 have_read_stdin = 1;
644 stream = stdin;
646 else
648 stream = fopen (file, "r");
649 if (stream == NULL)
651 error (0, errno, "%s", file);
652 return 1;
656 cut_stream (stream);
658 if (ferror (stream))
660 error (0, errno, "%s", file);
661 return 1;
663 if (!strcmp (file, "-"))
664 clearerr (stream); /* Also clear EOF. */
665 else if (fclose (stream) == EOF)
667 error (0, errno, "%s", file);
668 return 1;
670 return 0;
673 void
674 main (int argc, char **argv)
676 int optc, exit_status = 0;
678 program_name = argv[0];
679 setlocale (LC_ALL, "");
680 bindtextdomain (PACKAGE, LOCALEDIR);
681 textdomain (PACKAGE);
683 operating_mode = undefined_mode;
685 /* By default, all non-delimited lines are printed. */
686 suppress_non_delimited = 0;
688 delim = '\0';
689 have_read_stdin = 0;
691 while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, (int *) 0))
692 != EOF)
694 switch (optc)
696 case 0:
697 break;
699 case 'b':
700 case 'c':
701 /* Build the byte list. */
702 if (operating_mode != undefined_mode)
703 FATAL_ERROR (_("only one type of list may be specified"));
704 operating_mode = byte_mode;
705 if (set_fields (optarg) == 0)
706 FATAL_ERROR (_("missing list of positions"));
707 break;
709 case 'f':
710 /* Build the field list. */
711 if (operating_mode != undefined_mode)
712 FATAL_ERROR (_("only one type of list may be specified"));
713 operating_mode = field_mode;
714 if (set_fields (optarg) == 0)
715 FATAL_ERROR (_("missing list of fields"));
716 break;
718 case 'd':
719 /* New delimiter. */
720 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
721 if (optarg[0] != '\0' && optarg[1] != '\0')
722 FATAL_ERROR (_("the delimiter must be a single character"));
723 delim = optarg[0];
724 break;
726 case 'n':
727 break;
729 case 's':
730 suppress_non_delimited = 1;
731 break;
733 default:
734 usage (2);
738 if (show_version)
740 printf ("cut - %s\n", version_string);
741 exit (0);
744 if (show_help)
745 usage (0);
747 if (operating_mode == undefined_mode)
748 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
750 if (delim != '\0' && operating_mode != field_mode)
751 FATAL_ERROR (_("a delimiter may be specified only when operating on fields"));
753 if (suppress_non_delimited && operating_mode != field_mode)
754 FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
755 \tonly when operating on fields"));
757 if (delim == '\0')
758 delim = '\t';
760 if (optind == argc)
761 exit_status |= cut_file ("-");
762 else
763 for (; optind < argc; optind++)
764 exit_status |= cut_file (argv[optind]);
766 if (have_read_stdin && fclose (stdin) == EOF)
768 error (0, errno, "-");
769 exit_status = 1;
771 if (ferror (stdout) || fclose (stdout) == EOF)
772 error (1, errno, _("write error"));
774 exit (exit_status);