tests: add fold(1) test for --bytes option
[coreutils.git] / src / cut.c
blob061e09c331537508909b59d1246fe1f3dd2a2248
1 /* cut - remove parts of lines of files
2 Copyright (C) 1997-2024 Free Software Foundation, Inc.
3 Copyright (C) 1984 David M. Ihnat
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18 /* Written by David Ihnat. */
20 /* POSIX changes, bug fixes, long-named options, and cleanup
21 by David MacKenzie <djm@gnu.ai.mit.edu>.
23 Rewrite cut_fields and cut_bytes -- Jim Meyering. */
25 #include <config.h>
27 #include <stdio.h>
28 #include <getopt.h>
29 #include <sys/types.h>
30 #include "system.h"
32 #include "assure.h"
33 #include "fadvise.h"
34 #include "getndelim2.h"
36 #include "set-fields.h"
/* Name under which this program identifies itself; it is passed to
   emit_ancillary_info and to the --version handler (no 'g' prefix).  */
#define PROGRAM_NAME "cut"

/* Author list handed to the --version handler.  */
#define AUTHORS                         \
  proper_name ("David M. Ihnat"),       \
  proper_name ("David MacKenzie"),      \
  proper_name ("Jim Meyering")
/* Report MESSAGE as a diagnostic via error (with no errno text), then call
   usage (EXIT_FAILURE).  MESSAGE is printed through an explicit "%s" format
   so that a '%' appearing in it (for instance in a translation) is emitted
   literally instead of being interpreted as a conversion specification.  */
#define FATAL_ERROR(Message)                    \
  do                                            \
    {                                           \
      error (0, 0, "%s", (Message));            \
      usage (EXIT_FAILURE);                     \
    }                                           \
  while (0)
/* Cursor into the array of range pairs.  A byte or field index is
   tested against CURRENT_RP->lo and CURRENT_RP->hi; once the index
   exceeds CURRENT_RP->hi the cursor moves on to the next pair
   (see next_item).  */
static struct field_range_pair *current_rp;

/* Holding area for the first field of a line.  It is needed to
   implement -s (or its absence) when the field list does (does not)
   include field 1: the whole first field has to be read before we
   know whether a delimiter or a line terminator follows it, and
   nothing may be output until then.  In every other case cut_fields
   works without this buffer.  */
static char *field_1_buffer;

/* Allocated size of FIELD_1_BUFFER, in bytes.  */
static size_t field_1_bufsize;

/* When true, lines that contain no delimiter are not output;
   when false they are printed in full.  Only meaningful in
   field mode.  */
static bool suppress_non_delimited;

/* When true, select everything _except_ the listed bytes,
   characters, or fields.  */
static bool complement;

/* Field delimiter used in field mode.  */
static unsigned char delim;

/* Terminator of each input line/record.  */
static unsigned char line_delim = '\n';

/* Number of bytes in output_delimiter_string.  */
static size_t output_delimiter_length;

/* String written between output fields.  By default it is the
   one-byte string holding the input delimiter.  */
static char *output_delimiter_string;

/* Storage for the output delimiter when the default is in use.  */
static char output_delimiter_default[1];

/* Set once standard input has been read.  */
static bool have_read_stdin;
/* Pseudo short-option codes for long options that have no single-letter
   form.  They start at CHAR_MAX + 1 so that they cannot coincide with
   any real option character.  */
enum
{
  OUTPUT_DELIMITER_OPTION = CHAR_MAX + 1,
  COMPLEMENT_OPTION
};
109 static struct option const longopts[] =
111 {"bytes", required_argument, nullptr, 'b'},
112 {"characters", required_argument, nullptr, 'c'},
113 {"fields", required_argument, nullptr, 'f'},
114 {"delimiter", required_argument, nullptr, 'd'},
115 {"only-delimited", no_argument, nullptr, 's'},
116 {"output-delimiter", required_argument, nullptr, OUTPUT_DELIMITER_OPTION},
117 {"complement", no_argument, nullptr, COMPLEMENT_OPTION},
118 {"zero-terminated", no_argument, nullptr, 'z'},
119 {GETOPT_HELP_OPTION_DECL},
120 {GETOPT_VERSION_OPTION_DECL},
121 {nullptr, 0, nullptr, 0}
124 void
125 usage (int status)
127 if (status != EXIT_SUCCESS)
128 emit_try_help ();
129 else
131 printf (_("\
132 Usage: %s OPTION... [FILE]...\n\
134 program_name);
135 fputs (_("\
136 Print selected parts of lines from each FILE to standard output.\n\
137 "), stdout);
139 emit_stdin_note ();
140 emit_mandatory_arg_note ();
142 fputs (_("\
143 -b, --bytes=LIST select only these bytes\n\
144 -c, --characters=LIST select only these characters\n\
145 -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
146 "), stdout);
147 fputs (_("\
148 -f, --fields=LIST select only these fields; also print any line\n\
149 that contains no delimiter character, unless\n\
150 the -s option is specified\n\
151 -n (ignored)\n\
152 "), stdout);
153 fputs (_("\
154 --complement complement the set of selected bytes, characters\n\
155 or fields\n\
156 "), stdout);
157 fputs (_("\
158 -s, --only-delimited do not print lines not containing delimiters\n\
159 --output-delimiter=STRING use STRING as the output delimiter\n\
160 the default is to use the input delimiter\n\
161 "), stdout);
162 fputs (_("\
163 -z, --zero-terminated line delimiter is NUL, not newline\n\
164 "), stdout);
165 fputs (HELP_OPTION_DESCRIPTION, stdout);
166 fputs (VERSION_OPTION_DESCRIPTION, stdout);
167 fputs (_("\
169 Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
170 range, or many ranges separated by commas. Selected input is written\n\
171 in the same order that it is read, and is written exactly once.\n\
172 "), stdout);
173 fputs (_("\
174 Each range is one of:\n\
176 N N'th byte, character or field, counted from 1\n\
177 N- from N'th byte, character or field, to end of line\n\
178 N-M from N'th to M'th (included) byte, character or field\n\
179 -M from first to M'th (included) byte, character or field\n\
180 "), stdout);
181 emit_ancillary_info (PROGRAM_NAME);
183 exit (status);
187 /* Increment *ITEM_IDX (i.e., a field or byte index),
188 and if required CURRENT_RP. */
190 static inline void
191 next_item (uintmax_t *item_idx)
193 (*item_idx)++;
194 if ((*item_idx) > current_rp->hi)
195 current_rp++;
198 /* Return nonzero if the K'th field or byte is printable. */
200 static inline bool
201 print_kth (uintmax_t k)
203 return current_rp->lo <= k;
206 /* Return nonzero if K'th byte is the beginning of a range. */
208 static inline bool
209 is_range_start_index (uintmax_t k)
211 return k == current_rp->lo;
214 /* Read from stream STREAM, printing to standard output any selected bytes. */
216 static void
217 cut_bytes (FILE *stream)
219 uintmax_t byte_idx; /* Number of bytes in the line so far. */
220 /* Whether to begin printing delimiters between ranges for the current line.
221 Set after we've begun printing data corresponding to the first range. */
222 bool print_delimiter;
224 byte_idx = 0;
225 print_delimiter = false;
226 current_rp = frp;
227 while (true)
229 int c; /* Each character from the file. */
231 c = getc (stream);
233 if (c == line_delim)
235 if (putchar (c) < 0)
236 write_error ();
237 byte_idx = 0;
238 print_delimiter = false;
239 current_rp = frp;
241 else if (c == EOF)
243 if (byte_idx > 0)
245 if (putchar (line_delim) < 0)
246 write_error ();
248 break;
250 else
252 next_item (&byte_idx);
253 if (print_kth (byte_idx))
255 if (output_delimiter_string != output_delimiter_default)
257 if (print_delimiter && is_range_start_index (byte_idx))
259 if (fwrite (output_delimiter_string, sizeof (char),
260 output_delimiter_length, stdout)
261 != output_delimiter_length)
262 write_error ();
264 print_delimiter = true;
267 if (putchar (c) < 0)
268 write_error ();
274 /* Read from stream STREAM, printing to standard output any selected fields. */
276 static void
277 cut_fields (FILE *stream)
279 int c; /* Each character from the file. */
280 uintmax_t field_idx = 1;
281 bool found_any_selected_field = false;
282 bool buffer_first_field;
284 current_rp = frp;
286 c = getc (stream);
287 if (c == EOF)
288 return;
290 ungetc (c, stream);
291 c = 0;
293 /* To support the semantics of the -s flag, we may have to buffer
294 all of the first field to determine whether it is 'delimited.'
295 But that is unnecessary if all non-delimited lines must be printed
296 and the first field has been selected, or if non-delimited lines
297 must be suppressed and the first field has *not* been selected.
298 That is because a non-delimited line has exactly one field. */
299 buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
301 while (true)
303 if (field_idx == 1 && buffer_first_field)
305 ssize_t len;
306 size_t n_bytes;
308 len = getndelim2 (&field_1_buffer, &field_1_bufsize, 0,
309 GETNLINE_NO_LIMIT, delim, line_delim, stream);
310 if (len < 0)
312 free (field_1_buffer);
313 field_1_buffer = nullptr;
314 if (ferror (stream) || feof (stream))
315 break;
316 xalloc_die ();
319 n_bytes = len;
320 affirm (n_bytes != 0);
322 c = 0;
324 /* If the first field extends to the end of line (it is not
325 delimited) and we are printing all non-delimited lines,
326 print this one. */
327 if (to_uchar (field_1_buffer[n_bytes - 1]) != delim)
329 if (suppress_non_delimited)
331 /* Empty. */
333 else
335 if (fwrite (field_1_buffer, sizeof (char), n_bytes, stdout)
336 != n_bytes)
337 write_error ();
338 /* Make sure the output line is newline terminated. */
339 if (field_1_buffer[n_bytes - 1] != line_delim)
341 if (putchar (line_delim) < 0)
342 write_error ();
344 c = line_delim;
346 continue;
349 if (print_kth (1))
351 /* Print the field, but not the trailing delimiter. */
352 if (fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout)
353 != n_bytes - 1)
354 write_error ();
356 /* With -d$'\n' don't treat the last '\n' as a delimiter. */
357 if (delim == line_delim)
359 int last_c = getc (stream);
360 if (last_c != EOF)
362 ungetc (last_c, stream);
363 found_any_selected_field = true;
366 else
368 found_any_selected_field = true;
371 next_item (&field_idx);
374 int prev_c = c;
376 if (print_kth (field_idx))
378 if (found_any_selected_field)
380 if (fwrite (output_delimiter_string, sizeof (char),
381 output_delimiter_length, stdout)
382 != output_delimiter_length)
383 write_error ();
385 found_any_selected_field = true;
387 while ((c = getc (stream)) != delim && c != line_delim && c != EOF)
389 if (putchar (c) < 0)
390 write_error ();
391 prev_c = c;
394 else
396 while ((c = getc (stream)) != delim && c != line_delim && c != EOF)
397 prev_c = c;
400 /* With -d$'\n' don't treat the last '\n' as a delimiter. */
401 if (delim == line_delim && c == delim)
403 int last_c = getc (stream);
404 if (last_c != EOF)
405 ungetc (last_c, stream);
406 else
407 c = last_c;
410 if (c == delim)
411 next_item (&field_idx);
412 else if (c == line_delim || c == EOF)
414 if (found_any_selected_field
415 || !(suppress_non_delimited && field_idx == 1))
417 /* Make sure the output line is newline terminated. */
418 if (c == line_delim || prev_c != line_delim
419 || delim == line_delim)
421 if (putchar (line_delim) < 0)
422 write_error ();
425 if (c == EOF)
426 break;
428 /* Start processing the next input line. */
429 field_idx = 1;
430 current_rp = frp;
431 found_any_selected_field = false;
436 /* Process file FILE to standard output, using CUT_STREAM.
437 Return true if successful. */
439 static bool
440 cut_file (char const *file, void (*cut_stream) (FILE *))
442 FILE *stream;
444 if (STREQ (file, "-"))
446 have_read_stdin = true;
447 stream = stdin;
448 assume (stream); /* Pacify GCC bug#109613. */
450 else
452 stream = fopen (file, "r");
453 if (stream == nullptr)
455 error (0, errno, "%s", quotef (file));
456 return false;
460 fadvise (stream, FADVISE_SEQUENTIAL);
462 cut_stream (stream);
464 int err = errno;
465 if (!ferror (stream))
466 err = 0;
467 if (STREQ (file, "-"))
468 clearerr (stream); /* Also clear EOF. */
469 else if (fclose (stream) == EOF)
470 err = errno;
471 if (err)
473 error (0, err, "%s", quotef (file));
474 return false;
476 return true;
480 main (int argc, char **argv)
482 int optc;
483 bool ok;
484 bool delim_specified = false;
485 bool byte_mode = false;
486 char *spec_list_string = nullptr;
488 initialize_main (&argc, &argv);
489 set_program_name (argv[0]);
490 setlocale (LC_ALL, "");
491 bindtextdomain (PACKAGE, LOCALEDIR);
492 textdomain (PACKAGE);
494 atexit (close_stdout);
496 /* By default, all non-delimited lines are printed. */
497 suppress_non_delimited = false;
499 delim = '\0';
500 have_read_stdin = false;
502 while ((optc = getopt_long (argc, argv, "b:c:d:f:nsz", longopts, nullptr))
503 != -1)
505 switch (optc)
507 case 'b':
508 case 'c':
509 /* Build the byte list. */
510 byte_mode = true;
511 FALLTHROUGH;
512 case 'f':
513 /* Build the field list. */
514 if (spec_list_string)
515 FATAL_ERROR (_("only one list may be specified"));
516 spec_list_string = optarg;
517 break;
519 case 'd':
520 /* New delimiter. */
521 /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
522 if (optarg[0] != '\0' && optarg[1] != '\0')
523 FATAL_ERROR (_("the delimiter must be a single character"));
524 delim = optarg[0];
525 delim_specified = true;
526 break;
528 case OUTPUT_DELIMITER_OPTION:
529 /* Interpret --output-delimiter='' to mean
530 'use the NUL byte as the delimiter.' */
531 output_delimiter_length = (optarg[0] == '\0'
532 ? 1 : strlen (optarg));
533 output_delimiter_string = optarg;
534 break;
536 case 'n':
537 break;
539 case 's':
540 suppress_non_delimited = true;
541 break;
543 case 'z':
544 line_delim = '\0';
545 break;
547 case COMPLEMENT_OPTION:
548 complement = true;
549 break;
551 case_GETOPT_HELP_CHAR;
552 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
553 default:
554 usage (EXIT_FAILURE);
558 if (!spec_list_string)
559 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
561 if (byte_mode)
563 if (delim_specified)
564 FATAL_ERROR (_("an input delimiter may be specified only\
565 when operating on fields"));
567 if (suppress_non_delimited)
568 FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
569 \tonly when operating on fields"));
572 set_fields (spec_list_string,
573 ((byte_mode ? SETFLD_ERRMSG_USE_POS : 0)
574 | (complement ? SETFLD_COMPLEMENT : 0)));
576 if (!delim_specified)
577 delim = '\t';
579 if (output_delimiter_string == nullptr)
581 output_delimiter_default[0] = delim;
582 output_delimiter_string = output_delimiter_default;
583 output_delimiter_length = 1;
586 void (*cut_stream) (FILE *) = byte_mode ? cut_bytes : cut_fields;
587 if (optind == argc)
588 ok = cut_file ("-", cut_stream);
589 else
590 for (ok = true; optind < argc; optind++)
591 ok &= cut_file (argv[optind], cut_stream);
594 if (have_read_stdin && fclose (stdin) == EOF)
596 error (0, errno, "-");
597 ok = false;
600 return ok ? EXIT_SUCCESS : EXIT_FAILURE;