*** empty log message ***
[coreutils.git] / src / cut.c
blobac8864dfd97f6a1fc3e208c455fe5f13fb988b89
1 /* cut - remove parts of lines of files
2 Copyright (C) 1984, 1997, 1998, 1999 by David M. Ihnat
4 This program is a total rewrite of the Bell Laboratories Unix(Tm)
5 command of the same name, as of System V. It contains no proprietary
6 code, and therefore may be used without violation of any proprietary
7 agreements whatsoever. However, you will notice that the program is
8 copyrighted by me. This is to assure the program does *not* fall
9 into the public domain. Thus, I may specify just what I am now:
10 This program may be freely copied and distributed, provided this notice
11 remains; it may not be sold for profit without express written consent of
12 the author.
13 Please note that I recreated the behavior of the Unix(Tm) 'cut' command
14 as faithfully as possible; however, I haven't run a full set of regression
15 tests. Thus, the user of this program accepts full responsibility for any
16 effects or loss; in particular, the author is not responsible for any losses,
17 explicit or incidental, that may be incurred through use of this program.
19 I ask that any bugs (and, if possible, fixes) be reported to me when
20 possible. -David Ihnat (312) 784-4544 ignatz@homebru.chi.il.us
22 POSIX changes, bug fixes, long-named options, and cleanup
23 by David MacKenzie <djm@gnu.ai.mit.edu>.
25 Rewrite cut_fields and cut_bytes -- Jim Meyering (meyering@comco.com).
27 Options:
28 --bytes=byte-list
29 -b byte-list Print only the bytes in positions listed
30 in BYTE-LIST.
31 Tabs and backspaces are treated like any
32 other character; they take up 1 byte.
34 --characters=character-list
35 -c character-list Print only characters in positions listed
36 in CHARACTER-LIST.
37 The same as -b for now, but
38 internationalization will change that.
39 Tabs and backspaces are treated like any
40 other character; they take up 1 character.
42 --fields=field-list
43 -f field-list Print only the fields listed in FIELD-LIST.
44 Fields are separated by a TAB by default.
46 --delimiter=delim
47 -d delim For -f, fields are separated by the first
48 character in DELIM instead of TAB.
50 -n Do not split multibyte chars (no-op for now).
52 --only-delimited
53 -s For -f, do not print lines that do not contain
54 the field separator character.
56 The BYTE-LIST, CHARACTER-LIST, and FIELD-LIST are one or more numbers
57 or ranges separated by commas. The first byte, character, and field
58 are numbered 1.
60 A FILE of `-' means standard input. */
62 #include <config.h>
64 #include <stdio.h>
65 #include <assert.h>
66 #include <getopt.h>
67 #include <sys/types.h>
68 #include "system.h"
69 #include "error.h"
71 /* The official name of this program (e.g., no `g' prefix). */
72 #define PROGRAM_NAME "cut"
74 #define AUTHORS "David Ihnat, David MacKenzie, and Jim Meyering"
76 char *xstrdup ();
/* Report MESSAGE as a diagnostic through error (), then call usage (2),
   which ends by calling exit.  MESSAGE is handed to a "%s" format
   instead of being used as the format itself, so that a `%' inside a
   (possibly translated) message cannot be interpreted as a conversion
   specification.  */
#define FATAL_ERROR(Message) \
  do \
    { \
      error (0, 0, "%s", (Message)); \
      usage (2); \
    } \
  while (0)
/* Append the pair LOW, HIGH to the array RP of range pairs, doubling the
   allocation with xrealloc when the array is full.  The expansion reads
   and updates two variables that must be in scope where the macro is
   used: N_RP (number of pairs stored so far) and N_RP_ALLOCATED (number
   of pairs for which space is allocated).  RP must be a modifiable
   lvalue, because it is reassigned when the array is reallocated.  */
#define ADD_RANGE_PAIR(rp, low, high) \
  do \
    { \
      if (n_rp >= n_rp_allocated) \
        { \
          n_rp_allocated *= 2; \
          (rp) = (struct range_pair *) xrealloc ((char *) (rp), \
                                                 n_rp_allocated * sizeof (*(rp))); \
        } \
      (rp)[n_rp].lo = (low); \
      (rp)[n_rp].hi = (high); \
      ++n_rp; \
    } \
  while (0)
/* One closed interval LO..HI of selected byte or field numbers.  */
struct range_pair
{
  unsigned int lo;   /* First index covered by the interval.  */
  unsigned int hi;   /* Last index covered by the interval.  */
};
/* Holding area for the first field of a line.  It is needed to get the
   -s semantics right: when -s is given and field 1 is selected, or
   when -s is absent and field 1 is not selected, the whole first
   field has to be read before any of it is written, because only
   then is it known whether a delimiter or a newline follows it.  In
   the other cases cut_fields works without this buffer.  */
static char *field_1_buffer = NULL;

/* Number of bytes currently allocated for FIELD_1_BUFFER.  */
static int field_1_bufsize = 0;

/* Largest field or byte index that ends a closed or single-element
   range.  The start of a right-open range does not count: for the
   specs `2-5,9-' and `2-3,5,9-' alike this is 5.  */
static unsigned int max_range_endpoint = 0;

/* When nonzero, the index at which a range running to end of line
   begins.  */
static unsigned int eol_range_start = 0;

/* Selection table: the bytes to output in byte mode, the
   DELIM-separated fields to output in field mode.  Indices start
   at 1, so element zero is never used.  Index K is selected when
     (K <= MAX_RANGE_ENDPOINT and PRINTABLE_FIELD[K])
     || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START).  */
static int *printable_field = NULL;
/* What kind of list the command line asked for.  */
enum operating_mode
{
  /* No list option has been processed yet.  */
  undefined_mode = 0,

  /* Output characters that are in the given bytes.  */
  byte_mode,

  /* Output the given delimiter-separated fields.  */
  field_mode
};
153 /* The name this program was run with. */
154 char *program_name;
156 static enum operating_mode operating_mode;
158 /* If nonzero do not output lines containing no delimeter characters.
159 Otherwise, all such lines are printed. This option is valid only
160 with field mode. */
161 static int suppress_non_delimited;
163 /* The delimeter character for field mode. */
164 static int delim;
166 /* The length of output_delimiter_string. */
167 static size_t output_delimiter_length;
169 /* The output field separator string. Defaults to the 1-character
170 string consisting of the input delimiter. */
171 static char *output_delimiter_string;
173 /* Nonzero if we have ever read standard input. */
174 static int have_read_stdin;
176 static struct option const longopts[] =
178 {"bytes", required_argument, 0, 'b'},
179 {"characters", required_argument, 0, 'c'},
180 {"fields", required_argument, 0, 'f'},
181 {"delimiter", required_argument, 0, 'd'},
182 {"only-delimited", no_argument, 0, 's'},
183 {"output-delimiter", required_argument, 0, CHAR_MAX + 1},
184 {GETOPT_HELP_OPTION_DECL},
185 {GETOPT_VERSION_OPTION_DECL},
186 {0, 0, 0, 0}
/* Print usage information and exit.
   When STATUS is nonzero, write a one-line "Try `PROGRAM --help'" hint
   to stderr; otherwise write the full help text and the bug-report
   address to stdout.  In both cases finish by calling exit, so this
   function does not return.  The exit status is EXIT_SUCCESS when
   STATUS is zero and EXIT_FAILURE for every nonzero STATUS; the exact
   nonzero value passed in (callers use 2) is not propagated.
   NOTE(review): in this copy the help-text string literals look
   truncated (lines that held only the closing of the first literal or
   a bare newline escape appear to be missing, and column alignment is
   collapsed) -- confirm against a pristine copy before editing the
   strings.  */
189 void
190 usage (int status)
192 if (status != 0)
193 fprintf (stderr, _("Try `%s --help' for more information.\n"),
194 program_name);
195 else
197 printf (_("\
198 Usage: %s [OPTION]... [FILE]...\n\
200 program_name);
201 printf (_("\
202 Print selected parts of lines from each FILE to standard output.\n\
204 -b, --bytes=LIST output only these bytes\n\
205 -c, --characters=LIST output only these characters\n\
206 -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
207 -f, --fields=LIST output only these fields\n\
208 -n (ignored)\n\
209 -s, --only-delimited do not print lines not containing delimiters\n\
210 --output-delimiter=STRING use STRING as the output delimiter\n\
211 the default is to use the input delimiter\n\
212 --help display this help and exit\n\
213 --version output version information and exit\n\
215 Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
216 range, or many ranges separated by commas. Each range is one of:\n\
218 N N'th byte, character or field, counted from 1\n\
219 N- from N'th byte, character or field, to end of line\n\
220 N-M from N'th to M'th (included) byte, character or field\n\
221 -M from first to M'th (included) byte, character or field\n\
223 With no FILE, or when FILE is -, read standard input.\n\
224 "));
225 puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
227 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
/* The following function was copied from getline.c, but with these changes:
   - Read up to and including a newline or TERMINATOR, whichever comes first.
     The original does not treat newline specially.
   - Remove unused argument, OFFSET.
   - Use xmalloc and xrealloc instead of malloc and realloc.
   - Declare this function static.  */

/* Always add at least this many bytes when extending the buffer.  */
#define MIN_CHUNK 64

/* Read up to (and including) a newline or TERMINATOR from STREAM into
   *LINEPTR and NUL-terminate it.  *LINEPTR is either NULL or a pointer
   obtained from xmalloc that addresses *N bytes; it is xrealloc'd as
   needed and *N is updated to the new size.

   Return the number of bytes stored, not counting the terminating NUL.
   Return -1 if an argument is NULL, or if end of file or a read error
   is met before any byte has been stored by this call.  When end of
   file or an error follows some bytes, those bytes are returned as a
   partial line.  */

static int
getstr (char **lineptr, int *n, FILE *stream, int terminator)
{
  int nchars_avail;   /* Allocated but unused chars in *LINEPTR.  */
  char *read_pos;     /* Where we're reading into *LINEPTR.  */

  if (!lineptr || !n || !stream)
    return -1;

  if (!*lineptr)
    {
      *n = MIN_CHUNK;
      *lineptr = (char *) xmalloc (*n);
      if (!*lineptr)
        return -1;
    }

  nchars_avail = *n;
  read_pos = *lineptr;

  for (;;)
    {
      register int c = getc (stream);

      /* Keep room for the byte about to be stored AND for the NUL that
         terminates the line.  Growing only when no room at all is left
         would let a line whose terminator lands in the last free byte
         put the NUL one byte past the end of the allocation.  */

      assert (*n - nchars_avail == read_pos - *lineptr);
      if (nchars_avail < 2)
        {
          /* Remember the fill level as an integer; pointer arithmetic
             against the old block is not valid once it is resized.  */
          int used = (int) (read_pos - *lineptr);

          if (*n > MIN_CHUNK)
            *n *= 2;
          else
            *n += MIN_CHUNK;

          *lineptr = xrealloc (*lineptr, *n);
          if (!*lineptr)
            return -1;
          read_pos = *lineptr + used;
          nchars_avail = *n - used;
          assert (*n - nchars_avail == read_pos - *lineptr);
        }

      if (feof (stream) || ferror (stream))
        {
          /* Return partial line, if any.  */
          if (read_pos == *lineptr)
            return -1;
          else
            break;
        }

      *read_pos++ = c;
      nchars_avail--;

      if (c == terminator || c == '\n')
        /* Return the line.  */
        break;
    }

  /* Done - NUL terminate and return the number of chars read.  */
  *read_pos = '\0';

  return read_pos - *lineptr;
}
313 static int
314 print_kth (unsigned int k)
316 return ((0 < eol_range_start && eol_range_start <= k)
317 || (k <= max_range_endpoint && printable_field[k]));
320 /* Given the list of field or byte range specifications FIELDSTR, set
321 MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
322 array. If there is a right-open-ended range, set EOL_RANGE_START
323 to its starting index. FIELDSTR should be composed of one or more
324 numbers or ranges of numbers, separated by blanks or commas.
325 Incomplete ranges may be given: `-m' means `1-m'; `n-' means `n'
326 through end of line. Return nonzero if FIELDSTR contains at least
327 one field specification, zero otherwise. */
329 /* FIXME-someday: What if the user wants to cut out the 1,000,000-th field
330 of some huge input file? This function shouldn't have to alloate a table
331 of a million ints just so we can test every field < 10^6 with an array
332 dereference. Instead, consider using a dynamic hash table. It would be
333 simpler and nearly as good a solution to use a 32K x 4-byte table with
334 one bit per field index instead of a whole `int' per index. */
336 static int
337 set_fields (const char *fieldstr)
339 unsigned int initial = 1; /* Value of first number in a range. */
340 unsigned int value = 0; /* If nonzero, a number being accumulated. */
341 int dash_found = 0; /* Nonzero if a '-' is found in this field. */
342 int field_found = 0; /* Non-zero if at least one field spec
343 has been processed. */
345 struct range_pair *rp;
346 unsigned int n_rp;
347 unsigned int n_rp_allocated;
348 unsigned int i;
350 n_rp = 0;
351 n_rp_allocated = 16;
352 rp = (struct range_pair *) xmalloc (n_rp_allocated * sizeof (*rp));
354 /* Collect and store in RP the range end points.
355 It also sets EOL_RANGE_START if appropriate. */
357 for (;;)
359 if (*fieldstr == '-')
361 /* Starting a range. */
362 if (dash_found)
363 FATAL_ERROR (_("invalid byte or field list"));
364 dash_found++;
365 fieldstr++;
367 if (value)
369 initial = value;
370 value = 0;
372 else
373 initial = 1;
375 else if (*fieldstr == ',' || ISBLANK (*fieldstr) || *fieldstr == '\0')
377 /* Ending the string, or this field/byte sublist. */
378 if (dash_found)
380 dash_found = 0;
382 /* A range. Possibilites: -n, m-n, n-.
383 In any case, `initial' contains the start of the range. */
384 if (value == 0)
386 /* `n-'. From `initial' to end of line. */
387 eol_range_start = initial;
388 field_found = 1;
390 else
392 /* `m-n' or `-n' (1-n). */
393 if (value < initial)
394 FATAL_ERROR (_("invalid byte or field list"));
396 /* Is there already a range going to end of line? */
397 if (eol_range_start != 0)
399 /* Yes. Is the new sequence already contained
400 in the old one? If so, no processing is
401 necessary. */
402 if (initial < eol_range_start)
404 /* No, the new sequence starts before the
405 old. Does the old range going to end of line
406 extend into the new range? */
407 if (value + 1 >= eol_range_start)
409 /* Yes. Simply move the end of line marker. */
410 eol_range_start = initial;
412 else
414 /* No. A simple range, before and disjoint from
415 the range going to end of line. Fill it. */
416 ADD_RANGE_PAIR (rp, initial, value);
419 /* In any case, some fields were selected. */
420 field_found = 1;
423 else
425 /* There is no range going to end of line. */
426 ADD_RANGE_PAIR (rp, initial, value);
427 field_found = 1;
429 value = 0;
432 else if (value != 0)
434 /* A simple field number, not a range. */
435 ADD_RANGE_PAIR (rp, value, value);
436 value = 0;
437 field_found = 1;
440 if (*fieldstr == '\0')
442 break;
445 fieldstr++;
447 else if (ISDIGIT (*fieldstr))
449 /* FIXME: detect overflow? */
450 value = 10 * value + *fieldstr - '0';
451 fieldstr++;
453 else
454 FATAL_ERROR (_("invalid byte or field list"));
457 max_range_endpoint = 0;
458 for (i = 0; i < n_rp; i++)
460 if (rp[i].hi > max_range_endpoint)
461 max_range_endpoint = rp[i].hi;
464 /* Allocate an array large enough so that it may be indexed by
465 the field numbers corresponding to all finite ranges
466 (i.e. `2-6' or `-4', but not `5-') in FIELDSTR. */
468 printable_field = (int *) xmalloc ((max_range_endpoint + 1) * sizeof (int));
469 memset (printable_field, 0, (max_range_endpoint + 1) * sizeof (int));
471 /* Set the array entries corresponding to integers in the ranges of RP. */
472 for (i = 0; i < n_rp; i++)
474 unsigned int j;
475 for (j = rp[i].lo; j <= rp[i].hi; j++)
477 printable_field[j] = 1;
481 free (rp);
483 return field_found;
/* Copy to standard output the selected bytes of every line read from
   STREAM.  Each input newline is echoed and restarts the byte count.
   If the input ends in the middle of a line, a newline is added so the
   output is always newline-terminated.  */

static void
cut_bytes (FILE *stream)
{
  unsigned int pos = 0;   /* Position of the latest byte within its line.  */
  int ch;                 /* Each character from the file.  */

  while ((ch = getc (stream)) != EOF)
    {
      if (ch == '\n')
        {
          putchar ('\n');
          pos = 0;
          continue;
        }

      ++pos;
      if (print_kth (pos))
        putchar (ch);
    }

  /* Input ended; terminate a final line that lacked its newline.  */
  if (pos > 0)
    putchar ('\n');
}
522 /* Read from stream STREAM, printing to standard output any selected fields. */
524 static void
525 cut_fields (FILE *stream)
527 int c;
528 unsigned int field_idx;
529 int found_any_selected_field;
530 int buffer_first_field;
531 int empty_input;
533 found_any_selected_field = 0;
534 field_idx = 1;
536 c = getc (stream);
537 empty_input = (c == EOF);
538 if (c != EOF)
539 ungetc (c, stream);
541 /* To support the semantics of the -s flag, we may have to buffer
542 all of the first field to determine whether it is `delimited.'
543 But that is unnecessary if all non-delimited lines must be printed
544 and the first field has been selected, or if non-delimited lines
545 must be suppressed and the first field has *not* been selected.
546 That is because a non-delimited line has exactly one field. */
547 buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
549 while (1)
551 if (field_idx == 1 && buffer_first_field)
553 int len;
555 len = getstr (&field_1_buffer, &field_1_bufsize, stream, delim);
556 if (len < 0)
557 break;
559 assert (len != 0);
561 /* If the first field extends to the end of line (it is not
562 delimited) and we are printing all non-delimited lines,
563 print this one. */
564 if ((unsigned char) field_1_buffer[len - 1] != delim)
566 if (suppress_non_delimited)
568 /* Empty. */
570 else
572 fwrite (field_1_buffer, sizeof (char), len, stdout);
573 /* Make sure the output line is newline terminated. */
574 if (field_1_buffer[len - 1] != '\n')
575 putchar ('\n');
577 continue;
579 if (print_kth (1))
581 /* Print the field, but not the trailing delimiter. */
582 fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
583 found_any_selected_field = 1;
585 ++field_idx;
588 if (c != EOF)
590 if (print_kth (field_idx))
592 if (found_any_selected_field)
594 fwrite (output_delimiter_string, sizeof (char),
595 output_delimiter_length, stdout);
597 found_any_selected_field = 1;
599 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
601 putchar (c);
604 else
606 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
608 /* Empty. */
613 if (c == '\n')
615 c = getc (stream);
616 if (c != EOF)
618 ungetc (c, stream);
619 c = '\n';
623 if (c == delim)
624 ++field_idx;
625 else if (c == '\n' || c == EOF)
627 if (found_any_selected_field
628 || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
629 putchar ('\n');
630 if (c == EOF)
631 break;
632 field_idx = 1;
633 found_any_selected_field = 0;
638 static void
639 cut_stream (FILE *stream)
641 if (operating_mode == byte_mode)
642 cut_bytes (stream);
643 else
644 cut_fields (stream);
647 /* Process file FILE to standard output.
648 Return 0 if successful, 1 if not. */
650 static int
651 cut_file (char *file)
653 FILE *stream;
655 if (STREQ (file, "-"))
657 have_read_stdin = 1;
658 stream = stdin;
660 else
662 stream = fopen (file, "r");
663 if (stream == NULL)
665 error (0, errno, "%s", file);
666 return 1;
670 cut_stream (stream);
672 if (ferror (stream))
674 error (0, errno, "%s", file);
675 return 1;
677 if (STREQ (file, "-"))
678 clearerr (stream); /* Also clear EOF. */
679 else if (fclose (stream) == EOF)
681 error (0, errno, "%s", file);
682 return 1;
684 return 0;
688 main (int argc, char **argv)
690 int optc, exit_status = 0;
691 int delim_specified = 0;
693 program_name = argv[0];
694 setlocale (LC_ALL, "");
695 bindtextdomain (PACKAGE, LOCALEDIR);
696 textdomain (PACKAGE);
698 operating_mode = undefined_mode;
700 /* By default, all non-delimited lines are printed. */
701 suppress_non_delimited = 0;
703 delim = '\0';
704 have_read_stdin = 0;
706 while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, NULL)) != -1)
708 switch (optc)
710 case 0:
711 break;
713 case 'b':
714 case 'c':
715 /* Build the byte list. */
716 if (operating_mode != undefined_mode)
717 FATAL_ERROR (_("only one type of list may be specified"));
718 operating_mode = byte_mode;
719 if (set_fields (optarg) == 0)
720 FATAL_ERROR (_("missing list of positions"));
721 break;
723 case 'f':
724 /* Build the field list. */
725 if (operating_mode != undefined_mode)
726 FATAL_ERROR (_("only one type of list may be specified"));
727 operating_mode = field_mode;
728 if (set_fields (optarg) == 0)
729 FATAL_ERROR (_("missing list of fields"));
730 break;
732 case 'd':
733 /* New delimiter. */
734 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
735 if (optarg[0] != '\0' && optarg[1] != '\0')
736 FATAL_ERROR (_("the delimiter must be a single character"));
737 delim = (unsigned char) optarg[0];
738 delim_specified = 1;
739 break;
741 case CHAR_MAX + 1:
742 /* Interpret --output-delimiter='' to mean
743 `use the NUL byte as the delimiter.' */
744 output_delimiter_length = (optarg[0] == '\0'
745 ? 1 : strlen (optarg));
746 output_delimiter_string = xstrdup (optarg);
747 break;
749 case 'n':
750 break;
752 case 's':
753 suppress_non_delimited = 1;
754 break;
756 case_GETOPT_HELP_CHAR;
758 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
760 default:
761 usage (2);
765 if (operating_mode == undefined_mode)
766 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
768 if (delim != '\0' && operating_mode != field_mode)
769 FATAL_ERROR (_("a delimiter may be specified only when operating on fields"));
771 if (suppress_non_delimited && operating_mode != field_mode)
772 FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
773 \tonly when operating on fields"));
775 if (!delim_specified)
776 delim = '\t';
778 if (output_delimiter_string == NULL)
780 static char dummy[2];
781 dummy[0] = delim;
782 dummy[1] = '\0';
783 output_delimiter_string = dummy;
784 output_delimiter_length = 1;
787 if (optind == argc)
788 exit_status |= cut_file ("-");
789 else
790 for (; optind < argc; optind++)
791 exit_status |= cut_file (argv[optind]);
793 if (have_read_stdin && fclose (stdin) == EOF)
795 error (0, errno, "-");
796 exit_status = 1;
798 if (ferror (stdout) || fclose (stdout) == EOF)
799 error (EXIT_FAILURE, errno, _("write error"));
801 exit (exit_status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);