(main): Declare to be of type int, not void.
[coreutils.git] / src / cut.c
blob63490cee8f7e529c0e4db8504039bd5736f8192d
1 /* cut - remove parts of lines of files
2 Copyright (C) 1984 by David M. Ihnat
4 This program is a total rewrite of the Bell Laboratories Unix(Tm)
5 command of the same name, as of System V. It contains no proprietary
6 code, and therefore may be used without violation of any proprietary
7 agreements whatsoever. However, you will notice that the program is
8 copyrighted by me. This is to assure the program does *not* fall
9 into the public domain. Thus, I may specify just what I am now:
10 This program may be freely copied and distributed, provided this notice
11 remains; it may not be sold for profit without express written consent of
12 the author.
13 Please note that I recreated the behavior of the Unix(Tm) 'cut' command
14 as faithfully as possible; however, I haven't run a full set of regression
15 tests. Thus, the user of this program accepts full responsibility for any
16 effects or loss; in particular, the author is not responsible for any losses,
17 explicit or incidental, that may be incurred through use of this program.
19 I ask that any bugs (and, if possible, fixes) be reported to me when
20 possible. -David Ihnat (312) 784-4544 ignatz@homebru.chi.il.us
22 POSIX changes, bug fixes, long-named options, and cleanup
23 by David MacKenzie <djm@gnu.ai.mit.edu>.
25 Rewrite cut_fields and cut_bytes -- Jim Meyering (meyering@comco.com).
27 Options:
28 --bytes=byte-list
29 -b byte-list Print only the bytes in positions listed
30 in BYTE-LIST.
31 Tabs and backspaces are treated like any
32 other character; they take up 1 byte.
34 --characters=character-list
35 -c character-list Print only characters in positions listed
36 in CHARACTER-LIST.
37 The same as -b for now, but
38 internationalization will change that.
39 Tabs and backspaces are treated like any
40 other character; they take up 1 character.
42 --fields=field-list
43 -f field-list Print only the fields listed in FIELD-LIST.
44 Fields are separated by a TAB by default.
46 --delimiter=delim
47 -d delim For -f, fields are separated by the first
48 character in DELIM instead of TAB.
50 -n Do not split multibyte chars (no-op for now).
52 --only-delimited
53 -s For -f, do not print lines that do not contain
54 the field separator character.
56 The BYTE-LIST, CHARACTER-LIST, and FIELD-LIST are one or more numbers
57 or ranges separated by commas. The first byte, character, and field
58 are numbered 1.
60 A FILE of `-' means standard input. */
62 #include <config.h>
64 /* Get isblank from GNU libc. */
65 #define _GNU_SOURCE
67 #include <stdio.h>
69 #define NDEBUG
70 #include <assert.h>
72 #include <getopt.h>
73 #include <sys/types.h>
74 #include "system.h"
75 #include "error.h"
/* Report the message S through error () with no errno text, then call
   usage (2), which prints the `--help' hint and exits with status 2.
   S is handed to error () as the argument of a "%s" format instead of
   being used as the format itself, so a `%' inside a (possibly
   translated) message can never be taken for a conversion.  */
#define FATAL_ERROR(s)                                                  \
  do                                                                    \
    {                                                                   \
      error (0, 0, "%s", (s));                                          \
      usage (2);                                                        \
    }                                                                   \
  while (0)
/* Append the pair LOW, HIGH to the array RP of range pairs, doubling
   the allocation with xrealloc when the array is full.

   This macro is not self-contained: it reads and updates the variables
   N_RP (number of pairs stored so far) and N_RP_ALLOCATED (number of
   pairs RP has room for), which must be in scope where it is expanded;
   set_fields declares both as locals.  RP is evaluated several times
   and is assigned to, so it must be a side-effect-free lvalue.  */
#define ADD_RANGE_PAIR(rp, low, high)                                   \
  do                                                                    \
    {                                                                   \
      if (n_rp >= n_rp_allocated)                                       \
        {                                                               \
          n_rp_allocated *= 2;                                          \
          (rp) = (struct range_pair *) xrealloc ((rp),                  \
                                 n_rp_allocated * sizeof (*(rp)));      \
        }                                                               \
      (rp)[n_rp].lo = (low);                                            \
      (rp)[n_rp].hi = (high);                                           \
      ++n_rp;                                                           \
    }                                                                   \
  while (0)
/* One closed range of byte or field indices, as collected by
   set_fields.  A single index K is stored with LO == HI == K.  */
struct range_pair
  {
    unsigned int lo;    /* First index in the range.  */
    unsigned int hi;    /* Last index in the range (inclusive).  */
  };
110 char *xmalloc ();
111 char *xrealloc ();
113 /* This buffer is used to support the semantics of the -s option
114 (or lack of same) when the specified field list includes (does
115 not include) the first field. In both of those cases, the entire
116 first field must be read into this buffer to determine whether it
117 is followed by a delimiter or a newline before any of it may be
118 output. Otherwise, cut_fields can do the job without using this
119 buffer. */
120 static char *field_1_buffer;
122 /* The number of bytes allocated for FIELD_1_BUFFER. */
123 static int field_1_bufsize;
125 /* The largest field or byte index used as an endpoint of a closed
126 or degenerate range specification; this doesn't include the starting
127 index of right-open-ended ranges. For example, with either range spec
128 `2-5,9-', `2-3,5,9-' this variable would be set to 5. */
129 static unsigned int max_range_endpoint;
131 /* If nonzero, this is the index of the first field in a range that goes
132 to end of line. */
133 static unsigned int eol_range_start;
135 /* In byte mode, which bytes to output.
136 In field mode, which DELIM-separated fields to output.
137 Both bytes and fields are numbered starting with 1,
138 so the zeroth element of this array is unused.
139 A field or byte K has been selected if
140 (K <= MAX_RANGE_ENDPOINT and PRINTABLE_FIELD[K])
141 || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */
142 static int *printable_field;
/* Which kind of list (-b/-c or -f) governs the output.  */
enum operating_mode
  {
    /* No list option has been processed yet.  */
    undefined_mode = 0,

    /* Output characters that are in the given bytes.  */
    byte_mode,

    /* Output the given delimiter-separated fields.  */
    field_mode
  };
155 /* The name this program was run with. */
156 char *program_name;
158 static enum operating_mode operating_mode;
160 /* If nonzero do not output lines containing no delimiter characters.
161 Otherwise, all such lines are printed. This option is valid only
162 with field mode. */
163 static int suppress_non_delimited;
165 /* The delimiter character for field mode. */
166 static int delim;
168 /* Nonzero if we have ever read standard input. */
169 static int have_read_stdin;
171 /* If nonzero, display usage information and exit. */
172 static int show_help;
174 /* If nonzero, print the version on standard output then exit. */
175 static int show_version;
177 static struct option const longopts[] =
179 {"bytes", required_argument, 0, 'b'},
180 {"characters", required_argument, 0, 'c'},
181 {"fields", required_argument, 0, 'f'},
182 {"delimiter", required_argument, 0, 'd'},
183 {"only-delimited", no_argument, 0, 's'},
184 {"help", no_argument, &show_help, 1},
185 {"version", no_argument, &show_version, 1},
186 {0, 0, 0, 0}
/* Tell the user how to invoke the program, then terminate with exit
   status STATUS.  When STATUS is nonzero only a one-line pointer to
   `--help' is written to stderr; when it is zero the `Usage:' line and
   the full option summary are written to standard output.  The hint
   and the `Usage:' line interpolate PROGRAM_NAME, and every message
   passes through _().  This function never returns: it always ends
   in exit (STATUS).  */
189 static void
190 usage (int status)
192 if (status != 0)
193 fprintf (stderr, _("Try `%s --help' for more information.\n"),
194 program_name);
195 else
197 printf (_("\
198 Usage: %s [OPTION]... [FILE]...\n\
200 program_name);
201 printf (_("\
202 Print selected parts of lines from each FILE to standard output.\n\
204 -b, --bytes=LIST output only these bytes\n\
205 -c, --characters=LIST output only these characters\n\
206 -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
207 -f, --fields=LIST output only these fields\n\
208 -n (ignored)\n\
209 -s, --only-delimited do not print lines not containing delimiters\n\
210 --help display this help and exit\n\
211 --version output version information and exit\n\
213 Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
214 range, or many ranges separated by commas. Each range is one of:\n\
216 N N'th byte, character or field, counted from 1\n\
217 N- from N'th byte, character or field, to end of line\n\
218 N-M from N'th to M'th (included) byte, character or field\n\
219 -M from first to M'th (included) byte, character or field\n\
221 With no FILE, or when FILE is -, read standard input.\n\
222 "));
224 exit (status);
227 /* The following function was copied from getline.c, but with these changes:
228 - Read up to and including a newline or TERMINATOR, whichever comes first.
229 The original does not treat newline specially.
230 - Remove unused argument, OFFSET.
231 - Use xmalloc and xrealloc instead of malloc and realloc.
232 - Declare this function static. */
234 /* Always add at least this many bytes when extending the buffer. */
235 #define MIN_CHUNK 64
237 /* Read up to (and including) a newline or TERMINATOR from STREAM into
238 *LINEPTR (and null-terminate it). *LINEPTR is a pointer returned from
239 xmalloc (or NULL), pointing to *N characters of space. It is
240 xrealloc'd as necessary. Return the number of characters read (not
241 including the null terminator), or -1 on error or EOF. */
243 static int
244 getstr (char **lineptr, int *n, FILE *stream, char terminator)
246 int nchars_avail; /* Allocated but unused chars in *LINEPTR. */
247 char *read_pos; /* Where we're reading into *LINEPTR. */
249 if (!lineptr || !n || !stream)
250 return -1;
252 if (!*lineptr)
254 *n = MIN_CHUNK;
255 *lineptr = xmalloc (*n);
256 if (!*lineptr)
257 return -1;
260 nchars_avail = *n;
261 read_pos = *lineptr;
263 for (;;)
265 register int c = getc (stream);
267 /* We always want at least one char left in the buffer, since we
268 always (unless we get an error while reading the first char)
269 NUL-terminate the line buffer. */
271 assert (*n - nchars_avail == read_pos - *lineptr);
272 if (nchars_avail < 1)
274 if (*n > MIN_CHUNK)
275 *n *= 2;
276 else
277 *n += MIN_CHUNK;
279 nchars_avail = *n + *lineptr - read_pos;
280 *lineptr = xrealloc (*lineptr, *n);
281 if (!*lineptr)
282 return -1;
283 read_pos = *n - nchars_avail + *lineptr;
284 assert (*n - nchars_avail == read_pos - *lineptr);
287 if (feof (stream) || ferror (stream))
289 /* Return partial line, if any. */
290 if (read_pos == *lineptr)
291 return -1;
292 else
293 break;
296 *read_pos++ = c;
297 nchars_avail--;
299 if (c == terminator || c == '\n')
300 /* Return the line. */
301 break;
304 /* Done - NUL terminate and return the number of chars read. */
305 *read_pos = '\0';
307 return read_pos - *lineptr;
310 static int
311 print_kth (unsigned int k)
313 return ((0 < eol_range_start && eol_range_start <= k)
314 || (k <= max_range_endpoint && printable_field[k]));
317 /* Given the list of field or byte range specifications FIELDSTR, set
318 MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
319 array. If there is a right-open-ended range, set EOL_RANGE_START
320 to its starting index. FIELDSTR should be composed of one or more
321 numbers or ranges of numbers, separated by blanks or commas.
322 Incomplete ranges may be given: `-m' means `1-m'; `n-' means `n'
323 through end of line. Return nonzero if FIELDSTR contains at least
324 one field specification, zero otherwise. */
326 /* FIXME-someday: What if the user wants to cut out the 1,000,000-th field
327 of some huge input file? This function shouldn't have to alloate a table
328 of a million ints just so we can test every field < 10^6 with an array
329 dereference. Instead, consider using a dynamic hash table. It would be
330 simpler and nearly as good a solution to use a 32K x 4-byte table with
331 one bit per field index instead of a whole `int' per index. */
333 static int
334 set_fields (const char *fieldstr)
336 unsigned int initial = 1; /* Value of first number in a range. */
337 unsigned int value = 0; /* If nonzero, a number being accumulated. */
338 int dash_found = 0; /* Nonzero if a '-' is found in this field. */
339 int field_found = 0; /* Non-zero if at least one field spec
340 has been processed. */
342 struct range_pair *rp;
343 unsigned int n_rp;
344 unsigned int n_rp_allocated;
345 unsigned int i;
347 n_rp = 0;
348 n_rp_allocated = 16;
349 rp = (struct range_pair *) xmalloc (n_rp_allocated * sizeof (*rp));
351 /* Collect and store in RP the range end points.
352 It also sets EOL_RANGE_START if appropriate. */
354 for (;;)
356 if (*fieldstr == '-')
358 /* Starting a range. */
359 if (dash_found)
360 FATAL_ERROR (_("invalid byte or field list"));
361 dash_found++;
362 fieldstr++;
364 if (value)
366 initial = value;
367 value = 0;
369 else
370 initial = 1;
372 else if (*fieldstr == ',' || ISBLANK (*fieldstr) || *fieldstr == '\0')
374 /* Ending the string, or this field/byte sublist. */
375 if (dash_found)
377 dash_found = 0;
379 /* A range. Possibilites: -n, m-n, n-.
380 In any case, `initial' contains the start of the range. */
381 if (value == 0)
383 /* `n-'. From `initial' to end of line. */
384 eol_range_start = initial;
385 field_found = 1;
387 else
389 /* `m-n' or `-n' (1-n). */
390 if (value < initial)
391 FATAL_ERROR (_("invalid byte or field list"));
393 /* Is there already a range going to end of line? */
394 if (eol_range_start != 0)
396 /* Yes. Is the new sequence already contained
397 in the old one? If so, no processing is
398 necessary. */
399 if (initial < eol_range_start)
401 /* No, the new sequence starts before the
402 old. Does the old range going to end of line
403 extend into the new range? */
404 if (value + 1 >= eol_range_start)
406 /* Yes. Simply move the end of line marker. */
407 eol_range_start = initial;
409 else
411 /* No. A simple range, before and disjoint from
412 the range going to end of line. Fill it. */
413 ADD_RANGE_PAIR (rp, initial, value);
416 /* In any case, some fields were selected. */
417 field_found = 1;
420 else
422 /* There is no range going to end of line. */
423 ADD_RANGE_PAIR (rp, initial, value);
424 field_found = 1;
426 value = 0;
429 else if (value != 0)
431 /* A simple field number, not a range. */
432 ADD_RANGE_PAIR (rp, value, value);
433 value = 0;
434 field_found = 1;
437 if (*fieldstr == '\0')
439 break;
442 fieldstr++;
444 else if (ISDIGIT (*fieldstr))
446 /* FIXME: detect overflow? */
447 value = 10 * value + *fieldstr - '0';
448 fieldstr++;
450 else
451 FATAL_ERROR (_("invalid byte or field list"));
454 max_range_endpoint = 0;
455 for (i = 0; i < n_rp; i++)
457 if (rp[i].hi > max_range_endpoint)
458 max_range_endpoint = rp[i].hi;
461 /* Allocate an array large enough so that it may be indexed by
462 the field numbers corresponding to all finite ranges
463 (i.e. `2-6' or `-4', but not `5-') in FIELDSTR. */
465 printable_field = (int *) xmalloc ((max_range_endpoint + 1) * sizeof (int));
466 memset (printable_field, 0, (max_range_endpoint + 1) * sizeof (int));
468 /* Set the array entries corresponding to integers in the ranges of RP. */
469 for (i = 0; i < n_rp; i++)
471 unsigned int j;
472 for (j = rp[i].lo; j <= rp[i].hi; j++)
474 printable_field[j] = 1;
478 free (rp);
480 return field_found;
/* Copy to standard output the selected bytes of every line read from
   STREAM.  Byte positions restart at 1 after each newline; print_kth
   decides which positions are kept.  Every output line ends in a
   newline, including a final input line that lacked one.  */

static void
cut_bytes (FILE *stream)
{
  unsigned int pos = 0;         /* Bytes seen so far on the current line.  */
  int ch;

  while ((ch = getc (stream)) != EOF)
    {
      if (ch == '\n')
        {
          putchar ('\n');
          pos = 0;
          continue;
        }

      ++pos;
      if (print_kth (pos))
        putchar (ch);
    }

  /* The input ended in the middle of a line: terminate it.  */
  if (pos > 0)
    putchar ('\n');
}
519 /* Read from stream STREAM, printing to standard output any selected fields. */
521 static void
522 cut_fields (FILE *stream)
524 int c;
525 unsigned int field_idx;
526 int found_any_selected_field;
527 int buffer_first_field;
529 found_any_selected_field = 0;
530 field_idx = 1;
532 /* To support the semantics of the -s flag, we may have to buffer
533 all of the first field to determine whether it is `delimited.'
534 But that is unnecessary if all non-delimited lines must be printed
535 and the first field has been selected, or if non-delimited lines
536 must be suppressed and the first field has *not* been selected.
537 That is because a non-delimited line has exactly one field. */
538 buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
540 while (1)
542 if (field_idx == 1 && buffer_first_field)
544 int len;
546 len = getstr (&field_1_buffer, &field_1_bufsize, stream, delim);
547 if (len < 0)
548 break;
550 assert (len != 0);
552 /* If the first field extends to the end of line (it is not
553 delimited) and we are printing all non-delimited lines,
554 print this one. */
555 if (field_1_buffer[len - 1] != delim)
557 if (suppress_non_delimited)
559 /* Empty. */
561 else
563 fwrite (field_1_buffer, sizeof (char), len, stdout);
564 /* Make sure the output line is newline terminated. */
565 if (field_1_buffer[len - 1] != '\n')
566 putchar ('\n');
568 continue;
570 if (print_kth (1))
572 /* Print the field, but not the trailing delimiter. */
573 fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
574 found_any_selected_field = 1;
576 ++field_idx;
579 if (print_kth (field_idx))
581 if (found_any_selected_field)
582 putchar (delim);
583 found_any_selected_field = 1;
585 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
587 putchar (c);
590 else
592 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
594 /* Empty. */
598 if (c == '\n')
600 c = getc (stream);
601 if (c != EOF)
603 ungetc (c, stream);
604 c = '\n';
608 if (c == delim)
609 ++field_idx;
610 else if (c == '\n' || c == EOF)
612 if (found_any_selected_field
613 || !(suppress_non_delimited && field_idx == 1))
614 putchar ('\n');
615 if (c == EOF)
616 break;
617 field_idx = 1;
618 found_any_selected_field = 0;
623 static void
624 cut_stream (FILE *stream)
626 if (operating_mode == byte_mode)
627 cut_bytes (stream);
628 else
629 cut_fields (stream);
632 /* Process file FILE to standard output.
633 Return 0 if successful, 1 if not. */
635 static int
636 cut_file (char *file)
638 FILE *stream;
640 if (!strcmp (file, "-"))
642 have_read_stdin = 1;
643 stream = stdin;
645 else
647 stream = fopen (file, "r");
648 if (stream == NULL)
650 error (0, errno, "%s", file);
651 return 1;
655 cut_stream (stream);
657 if (ferror (stream))
659 error (0, errno, "%s", file);
660 return 1;
662 if (!strcmp (file, "-"))
663 clearerr (stream); /* Also clear EOF. */
664 else if (fclose (stream) == EOF)
666 error (0, errno, "%s", file);
667 return 1;
669 return 0;
673 main (int argc, char **argv)
675 int optc, exit_status = 0;
677 program_name = argv[0];
678 setlocale (LC_ALL, "");
679 bindtextdomain (PACKAGE, LOCALEDIR);
680 textdomain (PACKAGE);
682 operating_mode = undefined_mode;
684 /* By default, all non-delimited lines are printed. */
685 suppress_non_delimited = 0;
687 delim = '\0';
688 have_read_stdin = 0;
690 while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, (int *) 0))
691 != EOF)
693 switch (optc)
695 case 0:
696 break;
698 case 'b':
699 case 'c':
700 /* Build the byte list. */
701 if (operating_mode != undefined_mode)
702 FATAL_ERROR (_("only one type of list may be specified"));
703 operating_mode = byte_mode;
704 if (set_fields (optarg) == 0)
705 FATAL_ERROR (_("missing list of positions"));
706 break;
708 case 'f':
709 /* Build the field list. */
710 if (operating_mode != undefined_mode)
711 FATAL_ERROR (_("only one type of list may be specified"));
712 operating_mode = field_mode;
713 if (set_fields (optarg) == 0)
714 FATAL_ERROR (_("missing list of fields"));
715 break;
717 case 'd':
718 /* New delimiter. */
719 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
720 if (optarg[0] != '\0' && optarg[1] != '\0')
721 FATAL_ERROR (_("the delimiter must be a single character"));
722 delim = optarg[0];
723 break;
725 case 'n':
726 break;
728 case 's':
729 suppress_non_delimited = 1;
730 break;
732 default:
733 usage (2);
737 if (show_version)
739 printf ("cut - %s\n", PACKAGE_VERSION);
740 exit (0);
743 if (show_help)
744 usage (0);
746 if (operating_mode == undefined_mode)
747 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
749 if (delim != '\0' && operating_mode != field_mode)
750 FATAL_ERROR (_("a delimiter may be specified only when operating on fields"));
752 if (suppress_non_delimited && operating_mode != field_mode)
753 FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
754 \tonly when operating on fields"));
756 if (delim == '\0')
757 delim = '\t';
759 if (optind == argc)
760 exit_status |= cut_file ("-");
761 else
762 for (; optind < argc; optind++)
763 exit_status |= cut_file (argv[optind]);
765 if (have_read_stdin && fclose (stdin) == EOF)
767 error (0, errno, "-");
768 exit_status = 1;
770 if (ferror (stdout) || fclose (stdout) == EOF)
771 error (1, errno, _("write error"));
773 exit (exit_status);