1 /* split.c -- split a file into pieces.
2 Copyright (C) 1988-2023 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* By tege@sics.se, with rms.
20 * support -p REGEX as in BSD's split.
21 * support --suppress-matched as in csplit. */
24 #include <stdckdint.h>
28 #include <sys/types.h>
32 #include "alignalloc.h"
35 #include "fd-reopen.h"
37 #include "full-write.h"
39 #include "ioblksize.h"
42 #include "sys-limits.h"
43 #include "temp-stream.h"
44 #include "xbinary-io.h"
45 #include "xdectoint.h"
48 /* The official name of this program (e.g., no 'g' prefix). */
49 #define PROGRAM_NAME "split"
52 proper_name ("Torbjorn Granlund"), \
53 proper_name ("Richard M. Stallman")
55 /* Shell command to filter through, instead of creating files. */
56 static char const *filter_command
;
58 /* Process ID of the filter. */
59 static pid_t filter_pid
;
61 /* Array of open pipes. */
62 static int *open_pipes
;
63 static idx_t open_pipes_alloc
;
64 static int n_open_pipes
;
66 /* Whether SIGPIPE has the default action, when --filter is used. */
67 static bool default_SIGPIPE
;
69 /* Base name of output files. */
70 static char const *outbase
;
72 /* Name of output files. */
75 /* Pointer to the end of the prefix in OUTFILE.
76 Suffixes are inserted here. */
77 static char *outfile_mid
;
79 /* Generate new suffix when suffixes are exhausted. */
80 static bool suffix_auto
= true;
82 /* Length of OUTFILE's suffix. */
83 static idx_t suffix_length
;
85 /* Alphabet of characters to use in suffix. */
86 static char const *suffix_alphabet
= "abcdefghijklmnopqrstuvwxyz";
88 /* Numerical suffix start value. */
89 static char const *numeric_suffix_start
;
91 /* Additional suffix to append to output file names. */
92 static char const *additional_suffix
;
94 /* Name of input file. May be "-". */
97 /* stat buf for input file. */
98 static struct stat in_stat_buf
;
100 /* Descriptor on which output file is open. */
101 static int output_desc
= -1;
103 /* If true, print a diagnostic on standard error just before each
104 output file is opened. */
107 /* If true, don't generate zero length output files. */
108 static bool elide_empty_files
;
110 /* If true, in round robin mode, immediately copy
111 input to output, which is much slower, so disabled by default. */
112 static bool unbuffered
;
114 /* The character marking end of line. Defaults to \n below. */
115 static int eolchar
= -1;
117 /* The split mode to use. */
120 type_undef
, type_bytes
, type_byteslines
, type_lines
, type_digits
,
121 type_chunk_bytes
, type_chunk_lines
, type_rr
124 /* For long options that have no equivalent short option, use a
125 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
128 VERBOSE_OPTION
= CHAR_MAX
+ 1,
131 ADDITIONAL_SUFFIX_OPTION
134 static struct option
const longopts
[] =
136 {"bytes", required_argument
, nullptr, 'b'},
137 {"lines", required_argument
, nullptr, 'l'},
138 {"line-bytes", required_argument
, nullptr, 'C'},
139 {"number", required_argument
, nullptr, 'n'},
140 {"elide-empty-files", no_argument
, nullptr, 'e'},
141 {"unbuffered", no_argument
, nullptr, 'u'},
142 {"suffix-length", required_argument
, nullptr, 'a'},
143 {"additional-suffix", required_argument
, nullptr,
144 ADDITIONAL_SUFFIX_OPTION
},
145 {"numeric-suffixes", optional_argument
, nullptr, 'd'},
146 {"hex-suffixes", optional_argument
, nullptr, 'x'},
147 {"filter", required_argument
, nullptr, FILTER_OPTION
},
148 {"verbose", no_argument
, nullptr, VERBOSE_OPTION
},
149 {"separator", required_argument
, nullptr, 't'},
150 {"-io-blksize", required_argument
, nullptr,
151 IO_BLKSIZE_OPTION
}, /* do not document */
152 {GETOPT_HELP_OPTION_DECL
},
153 {GETOPT_VERSION_OPTION_DECL
},
154 {nullptr, 0, nullptr, 0}
157 /* Return true if the errno value, ERR, is ignorable.
An error is treated as ignorable only when a filter command is
configured (filter_command is non-null) and ERR is EPIPE. */
161 return filter_command
&& err
== EPIPE
;
/* Settle the global suffix_length for a split of SPLIT_TYPE into N_UNITS
units. For the chunk and round-robin types a minimum length is derived
from N_UNITS and the size of suffix_alphabet; a non-zero suffix_length
smaller than that minimum is diagnosed as a fatal error. */
165 set_suffix_length (intmax_t n_units
, enum Split_type split_type
)
/* Default suffix length; also the lower bound applied by the MAX below. */
167 #define DEFAULT_SUFFIX_LENGTH 2
169 int suffix_length_needed
= 0;
171 /* The suffix auto length feature is incompatible with
172 a user specified start value as the generated suffixes
173 are not all consecutive. */
174 if (numeric_suffix_start
)
177 /* Auto-calculate the suffix length if the number of files is given. */
178 if (split_type
== type_chunk_bytes
|| split_type
== type_chunk_lines
179 || split_type
== type_rr
)
/* Start from N_UNITS - 1; this value is later offset by the numeric
start value and divided down to count suffix characters. */
181 intmax_t n_units_end
= n_units
- 1;
182 if (numeric_suffix_start
)
/* Parse the start value as a base-10 integer. */
185 strtol_error e
= xstrtoimax (numeric_suffix_start
, nullptr, 10,
/* Only adjust when the start value parsed cleanly and is below N_UNITS. */
187 if (e
== LONGINT_OK
&& n_start
< n_units
)
189 /* Restrict auto adjustment so we don't keep
190 incrementing a suffix size arbitrarily,
191 as that would break sort order for files
192 generated from multiple split runs. */
/* Add the start value with overflow checking, saturating at INTMAX_MAX. */
193 if (ckd_add (&n_units_end
, n_units_end
, n_start
))
194 n_units_end
= INTMAX_MAX
;
/* Count suffix characters: suffix_length_needed grows by one per pass
while n_units_end is repeatedly divided by the alphabet size, until the
quotient reaches zero. */
198 idx_t alphabet_len
= strlen (suffix_alphabet
);
200 suffix_length_needed
++;
201 while (n_units_end
/= alphabet_len
);
206 if (suffix_length
) /* set by user */
208 if (suffix_length
< suffix_length_needed
)
209 error (EXIT_FAILURE
, 0,
210 _("the suffix length needs to be at least %d"),
211 suffix_length_needed
);
/* Use the larger of the default and the computed minimum. */
216 suffix_length
= MAX (DEFAULT_SUFFIX_LENGTH
, suffix_length_needed
);
222 if (status
!= EXIT_SUCCESS
)
227 Usage: %s [OPTION]... [FILE [PREFIX]]\n\
231 Output pieces of FILE to PREFIXaa, PREFIXab, ...;\n\
232 default size is 1000 lines, and default PREFIX is 'x'.\n\
236 emit_mandatory_arg_note ();
238 fprintf (stdout
, _("\
239 -a, --suffix-length=N generate suffixes of length N (default %d)\n\
240 --additional-suffix=SUFFIX append an additional SUFFIX to file names\n\
241 -b, --bytes=SIZE put SIZE bytes per output file\n\
242 -C, --line-bytes=SIZE put at most SIZE bytes of records per output file\n\
243 -d use numeric suffixes starting at 0, not alphabetic\n\
244 --numeric-suffixes[=FROM] same as -d, but allow setting the start value\
246 -x use hex suffixes starting at 0, not alphabetic\n\
247 --hex-suffixes[=FROM] same as -x, but allow setting the start value\n\
248 -e, --elide-empty-files do not generate empty output files with '-n'\n\
249 --filter=COMMAND write to shell COMMAND; file name is $FILE\n\
250 -l, --lines=NUMBER put NUMBER lines/records per output file\n\
251 -n, --number=CHUNKS generate CHUNKS output files; see explanation below\n\
252 -t, --separator=SEP use SEP instead of newline as the record separator;\n\
253 '\\0' (zero) specifies the NUL character\n\
254 -u, --unbuffered immediately copy input to output with '-n r/...'\n\
255 "), DEFAULT_SUFFIX_LENGTH
);
257 --verbose print a diagnostic just before each\n\
258 output file is opened\n\
260 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
261 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
265 N split into N files based on size of input\n\
266 K/N output Kth of N to stdout\n\
267 l/N split into N files without splitting lines/records\n\
268 l/K/N output Kth of N to stdout without splitting lines/records\n\
269 r/N like 'l' but use round robin distribution\n\
270 r/K/N likewise but only output Kth of N to stdout\n\
272 emit_ancillary_info (PROGRAM_NAME
);
277 /* Copy the data in FD to a temporary file, then make that file FD.
278 Use BUF, of size BUFSIZE, to copy. Return the number of
279 bytes copied, or -1 (setting errno) on error. */
281 copy_to_tmpfile (int fd
, char *buf
, idx_t bufsize
)
/* Obtain the temporary stream TMP that will receive the data. */
284 if (!temp_stream (&tmp
, nullptr))
/* Drain FD into TMP, at most BUFSIZE bytes per read, for as long as
read returns a positive count. */
289 while (0 < (r
= read (fd
, buf
, bufsize
)))
/* A short fwrite is detected by comparing its result with R. */
291 if (fwrite (buf
, 1, r
, tmp
) != r
)
/* Keep the running total in COPIED, with overflow checking. */
293 if (ckd_add (&copied
, copied
, r
))
/* Make FD refer to the temporary file's descriptor. */
302 r
= dup2 (fileno (tmp
), fd
);
/* The stream itself is closed once FD has been redirected. */
305 if (fclose (tmp
) < 0)
310 /* Return the number of bytes that can be read from FD with status ST.
311 Store up to the first BUFSIZE bytes of the file's data into BUF,
312 and advance the file position by the number of bytes read. On
313 input error, set errno and return -1. */
316 input_file_size (int fd
, struct stat
const *st
, char *buf
, idx_t bufsize
)
/* Fill BUF incrementally: each read appends after the SIZE bytes already
stored and is limited to the space that remains. */
321 ssize_t n_read
= read (fd
, buf
+ size
, bufsize
- size
);
/* Early exit: propagate a negative read result, otherwise report the
bytes accumulated so far. */
323 return n_read
< 0 ? n_read
: size
;
326 while (size
< bufsize
);
/* BUF is full, so the total must come from the file itself. Fall back to
copying FD into a temporary file when a usable st_size is smaller than
what was already read, when the current offset cannot be obtained or is
less than SIZE, or when seeking to the end fails. */
329 if ((usable_st_size (st
) && st
->st_size
< size
)
330 || (cur
= lseek (fd
, 0, SEEK_CUR
)) < 0
331 || cur
< size
/* E.g., /dev/zero on GNU/Linux. */
332 || (end
= lseek (fd
, 0, SEEK_END
)) < 0)
/* The copy uses a scratch buffer of BUFSIZE bytes; END becomes the
number of bytes copied (or -1). */
334 char *tmpbuf
= xmalloc (bufsize
);
335 end
= copy_to_tmpfile (fd
, tmpbuf
, bufsize
);
/* Add the bytes between CUR and END to SIZE, with overflow checking;
an END of OFF_T_MAX is also caught by this test. */
342 if (end
== OFF_T_MAX
/* E.g., /dev/zero on GNU/Hurd. */
343 || (cur
< end
&& ckd_add (&size
, size
, end
- cur
)))
/* Seek back to offset CUR, undoing the seek to the end. */
351 off_t r
= lseek (fd
, cur
, SEEK_SET
);
359 /* Compute the next sequential output file name and store it into the
363 next_file_name (void)
365 /* Index in suffix_alphabet of each character in the suffix. */
366 static idx_t
*sufindex
;
367 static idx_t outbase_length
;
368 static idx_t outfile_length
;
369 static idx_t addsuf_length
;
373 bool overflow
, widen
;
376 widen
= !! outfile_length
;
380 /* Allocate and initialize the first file name. */
382 outbase_length
= strlen (outbase
);
383 addsuf_length
= additional_suffix
? strlen (additional_suffix
) : 0;
384 overflow
= ckd_add (&outfile_length
, outbase_length
+ addsuf_length
,
389 /* Reallocate and initialize a new wider file name.
390 We do this by subsuming the unchanging part of
391 the generated suffix into the prefix (base), and
392 reinitializing the now one longer suffix. */
394 overflow
= ckd_add (&outfile_length
, outfile_length
, 2);
399 overflow
|= ckd_add (&outfile_size
, outfile_length
, 1);
402 outfile
= xirealloc (outfile
, outfile_size
);
405 memcpy (outfile
, outbase
, outbase_length
);
408 /* Append the last alphabet character to the file name prefix. */
409 outfile
[outbase_length
] = suffix_alphabet
[sufindex
[0]];
413 outfile_mid
= outfile
+ outbase_length
;
414 memset (outfile_mid
, suffix_alphabet
[0], suffix_length
);
415 if (additional_suffix
)
416 memcpy (outfile_mid
+ suffix_length
, additional_suffix
, addsuf_length
);
417 outfile
[outfile_length
] = 0;
420 sufindex
= xicalloc (suffix_length
, sizeof *sufindex
);
422 if (numeric_suffix_start
)
426 /* Update the output file name. */
427 idx_t i
= strlen (numeric_suffix_start
);
428 memcpy (outfile_mid
+ suffix_length
- i
, numeric_suffix_start
, i
);
430 /* Update the suffix index. */
431 idx_t
*sufindex_end
= sufindex
+ suffix_length
;
433 *--sufindex_end
= numeric_suffix_start
[i
] - '0';
436 #if ! _POSIX_NO_TRUNC && HAVE_PATHCONF && defined _PC_NAME_MAX
437 /* POSIX requires that if the output file name is too long for
438 its directory, 'split' must fail without creating any files.
439 This must be checked for explicitly on operating systems that
440 silently truncate file names. */
442 char *dir
= dir_name (outfile
);
443 long name_max
= pathconf (dir
, _PC_NAME_MAX
);
444 if (0 <= name_max
&& name_max
< base_len (last_component (outfile
)))
445 error (EXIT_FAILURE
, ENAMETOOLONG
, "%s", quotef (outfile
));
452 /* Increment the suffix in place, if possible. */
454 idx_t i
= suffix_length
;
458 if (suffix_auto
&& i
== 0 && ! suffix_alphabet
[sufindex
[0] + 1])
460 outfile_mid
[i
] = suffix_alphabet
[sufindex
[i
]];
464 outfile_mid
[i
] = suffix_alphabet
[sufindex
[i
]];
466 error (EXIT_FAILURE
, 0, _("output file suffixes exhausted"));
470 /* Create or truncate a file. */
473 create (char const *name
)
478 fprintf (stdout
, _("creating file %s\n"), quoteaf (name
));
480 int oflags
= O_WRONLY
| O_CREAT
| O_BINARY
;
481 int fd
= open (name
, oflags
| O_EXCL
, MODE_RW_UGO
);
482 if (0 <= fd
|| errno
!= EEXIST
)
484 fd
= open (name
, oflags
, MODE_RW_UGO
);
487 struct stat out_stat_buf
;
488 if (fstat (fd
, &out_stat_buf
) != 0)
489 error (EXIT_FAILURE
, errno
, _("failed to stat %s"), quoteaf (name
));
490 if (SAME_INODE (in_stat_buf
, out_stat_buf
))
491 error (EXIT_FAILURE
, 0, _("%s would overwrite input; aborting"),
494 = S_ISREG (out_stat_buf
.st_mode
) || S_TYPEISSHM (&out_stat_buf
);
495 if (! (regularish
&& out_stat_buf
.st_size
== 0)
496 && ftruncate (fd
, 0) < 0 && regularish
)
497 error (EXIT_FAILURE
, errno
, _("%s: error truncating"), quotef (name
));
505 char const *shell_prog
= getenv ("SHELL");
506 if (shell_prog
== nullptr)
507 shell_prog
= "/bin/sh";
508 if (setenv ("FILE", name
, 1) != 0)
509 error (EXIT_FAILURE
, errno
,
510 _("failed to set FILE environment variable"));
512 fprintf (stdout
, _("executing with FILE=%s\n"), quotef (name
));
513 if (pipe (fd_pair
) != 0)
514 error (EXIT_FAILURE
, errno
, _("failed to create pipe"));
518 /* This is the child process. If an error occurs here, the
519 parent will eventually learn about it after doing a wait,
520 at which time it will emit its own error message. */
522 /* We have to close any pipes that were opened during an
523 earlier call, otherwise this process will be holding a
524 write-pipe that will prevent the earlier process from
525 reading an EOF on the corresponding read-pipe. */
526 for (j
= 0; j
< n_open_pipes
; ++j
)
527 if (close (open_pipes
[j
]) != 0)
528 error (EXIT_FAILURE
, errno
, _("closing prior pipe"));
529 if (close (fd_pair
[1]))
530 error (EXIT_FAILURE
, errno
, _("closing output pipe"));
531 if (fd_pair
[0] != STDIN_FILENO
)
533 if (dup2 (fd_pair
[0], STDIN_FILENO
) != STDIN_FILENO
)
534 error (EXIT_FAILURE
, errno
, _("moving input pipe"));
535 if (close (fd_pair
[0]) != 0)
536 error (EXIT_FAILURE
, errno
, _("closing input pipe"));
539 signal (SIGPIPE
, SIG_DFL
);
540 execl (shell_prog
, last_component (shell_prog
), "-c",
541 filter_command
, (char *) nullptr);
542 error (EXIT_FAILURE
, errno
, _("failed to run command: \"%s -c %s\""),
543 shell_prog
, filter_command
);
546 error (EXIT_FAILURE
, errno
, _("fork system call failed"));
547 if (close (fd_pair
[0]) != 0)
548 error (EXIT_FAILURE
, errno
, _("failed to close input pipe"));
549 filter_pid
= child_pid
;
550 if (n_open_pipes
== open_pipes_alloc
)
551 open_pipes
= xpalloc (open_pipes
, &open_pipes_alloc
, 1,
552 MIN (INT_MAX
, IDX_MAX
), sizeof *open_pipes
);
553 open_pipes
[n_open_pipes
++] = fd_pair
[1];
558 /* Close the output file, and do any associated cleanup.
559 If FP and FD are both specified, they refer to the same open file;
560 in this case FP is closed, but FD is still used in cleanup.
PID is waited for with waitpid and its termination status is turned
into a diagnostic; NAME is used in the error messages. */
562 closeout (FILE *fp
, int fd
, pid_t pid
, char const *name
)
/* A failing fclose is fatal unless its errno is ignorable. */
564 if (fp
!= nullptr && fclose (fp
) != 0 && ! ignorable (errno
))
565 error (EXIT_FAILURE
, errno
, "%s", quotef (name
));
/* With no stream, close the raw descriptor instead. */
568 if (fp
== nullptr && close (fd
) < 0)
569 error (EXIT_FAILURE
, errno
, "%s", quotef (name
));
/* Drop FD from the open_pipes table: its slot is overwritten with the
last entry and the count is decremented. */
571 for (j
= 0; j
< n_open_pipes
; ++j
)
573 if (open_pipes
[j
] == fd
)
575 open_pipes
[j
] = open_pipes
[--n_open_pipes
];
/* Collect the child's termination status. */
583 if (waitpid (pid
, &wstatus
, 0) < 0)
584 error (EXIT_FAILURE
, errno
, _("waiting for child process"));
/* The child was terminated by a signal. */
585 else if (WIFSIGNALED (wstatus
))
587 int sig
= WTERMSIG (wstatus
);
/* Name the signal for the diagnostic, falling back to its decimal
number when sig2str cannot convert it; the buffer is sized for
either form. */
590 char signame
[MAX (SIG2STR_MAX
, INT_BUFSIZE_BOUND (int))];
591 if (sig2str (sig
, signame
) != 0)
592 sprintf (signame
, "%d", sig
);
594 _("with FILE=%s, signal %s from command: %s"),
595 quotef (name
), signame
, filter_command
);
/* The child exited normally. */
598 else if (WIFEXITED (wstatus
))
600 int ex
= WEXITSTATUS (wstatus
);
/* The child's exit status is passed to error () as its status
argument as well as being printed. */
602 error (ex
, 0, _("with FILE=%s, exit %d from command: %s"),
603 quotef (name
), ex
, filter_command
);
607 /* shouldn't happen. */
608 error (EXIT_FAILURE
, 0,
609 _("unknown status from command (0x%X)"), wstatus
+ 0u);
614 /* Write BYTES bytes at BP to an output file.
615 If NEW_FILE_FLAG is true, open the next output file.
616 Otherwise add to the same output file already in use.
617 Return true if successful. */
620 cwrite (bool new_file_flag
, char const *bp
, idx_t bytes
)
/* Special case: a null BP with zero BYTES while elide_empty_files is
set. */
624 if (!bp
&& bytes
== 0 && elide_empty_files
)
/* Finish the previous output first; filter_pid is handed to closeout so
it can wait for that output's filter process. */
626 closeout (nullptr, output_desc
, filter_pid
, outfile
);
/* Open the output named by OUTFILE and remember its descriptor. */
628 output_desc
= create (outfile
);
630 error (EXIT_FAILURE
, errno
, "%s", quotef (outfile
));
/* Success means full_write transferred exactly BYTES bytes. */
633 if (full_write (output_desc
, bp
, bytes
) == bytes
)
/* A short write is fatal unless its errno is ignorable. */
637 if (! ignorable (errno
))
638 error (EXIT_FAILURE
, errno
, "%s", quotef (outfile
));
643 /* Split into pieces of exactly N_BYTES bytes.
644 However, the first REM_BYTES pieces should be 1 byte longer.
645 Use buffer BUF, whose size is BUFSIZE.
646 If INITIAL_READ is nonnegative,
647 BUF contains the first INITIAL_READ input bytes. */
650 bytes_split (intmax_t n_bytes
, intmax_t rem_bytes
,
651 char *buf
, idx_t bufsize
, ssize_t initial_read
,
654 bool new_file_flag
= true;
655 bool filter_ok
= true;
657 intmax_t to_write
= n_bytes
+ (0 < rem_bytes
);
658 bool eof
= ! to_write
;
663 if (0 <= initial_read
)
665 n_read
= initial_read
;
667 eof
= n_read
< bufsize
;
672 && 0 <= lseek (STDIN_FILENO
, to_write
, SEEK_CUR
))
674 to_write
= n_bytes
+ (opened
+ 1 < rem_bytes
);
675 new_file_flag
= true;
678 n_read
= read (STDIN_FILENO
, buf
, bufsize
);
680 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
684 while (0 < to_write
&& to_write
<= n_read
)
686 if (filter_ok
|| new_file_flag
)
687 filter_ok
= cwrite (new_file_flag
, bp_out
, to_write
);
688 opened
+= new_file_flag
;
689 new_file_flag
= !max_files
|| (opened
< max_files
);
690 if (! filter_ok
&& ! new_file_flag
)
692 /* If filters no longer accepting input, stop reading. */
699 to_write
= n_bytes
+ (opened
< rem_bytes
);
703 if (filter_ok
|| new_file_flag
)
704 filter_ok
= cwrite (new_file_flag
, bp_out
, n_read
);
705 opened
+= new_file_flag
;
706 new_file_flag
= false;
707 if (! filter_ok
&& opened
== max_files
)
709 /* If filters no longer accepting input, stop reading. */
716 /* Ensure NUMBER files are created, which truncates
717 any existing files or notifies any consumers on fifos.
718 FIXME: Should we do this before EXIT_FAILURE? */
719 while (opened
++ < max_files
)
720 cwrite (true, nullptr, 0);
723 /* Split into pieces of exactly N_LINES lines.
724 Use buffer BUF, whose size is BUFSIZE. */
727 lines_split (intmax_t n_lines
, char *buf
, idx_t bufsize
)
730 char *bp
, *bp_out
, *eob
;
731 bool new_file_flag
= true;
736 n_read
= read (STDIN_FILENO
, buf
, bufsize
);
738 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
744 bp
= rawmemchr (bp
, eolchar
);
747 if (eob
!= bp_out
) /* do not write 0 bytes! */
749 idx_t len
= eob
- bp_out
;
750 cwrite (new_file_flag
, bp_out
, len
);
751 new_file_flag
= false;
759 cwrite (new_file_flag
, bp_out
, bp
- bp_out
);
761 new_file_flag
= true;
769 /* Split into pieces that are as large as possible while still not more
770 than N_BYTES bytes, and are split on line boundaries except
771 where lines longer than N_BYTES bytes occur. */
774 line_bytes_split (intmax_t n_bytes
, char *buf
, idx_t bufsize
)
777 intmax_t n_out
= 0; /* for each split. */
779 char *hold
= nullptr; /* for lines > bufsize. */
781 bool split_line
= false; /* Whether a \n was output in a split. */
785 n_read
= read (STDIN_FILENO
, buf
, bufsize
);
787 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
788 idx_t n_left
= n_read
;
792 idx_t split_rest
= 0;
796 /* Determine End Of Chunk and/or End of Line,
797 which are used below to select what to write or buffer. */
798 if (n_bytes
- n_out
- n_hold
<= n_left
)
800 /* Have enough for split. */
801 split_rest
= n_bytes
- n_out
- n_hold
;
802 eoc
= sob
+ split_rest
- 1;
803 eol
= memrchr (sob
, eolchar
, split_rest
);
806 eol
= memrchr (sob
, eolchar
, n_left
);
808 /* Output hold space if possible. */
809 if (n_hold
&& !(!eol
&& n_out
))
811 cwrite (n_out
== 0, hold
, n_hold
);
813 if (n_hold
> bufsize
)
814 hold
= xirealloc (hold
, bufsize
);
819 /* Output to eol if present. */
823 idx_t n_write
= eol
- sob
+ 1;
824 cwrite (n_out
== 0, sob
, n_write
);
829 split_rest
-= n_write
;
832 /* Output to eoc or eob if possible. */
833 if (n_left
&& !split_line
)
835 idx_t n_write
= eoc
? split_rest
: n_left
;
836 cwrite (n_out
== 0, sob
, n_write
);
841 split_rest
-= n_write
;
844 /* Update hold if needed. */
845 if ((eoc
&& split_rest
) || (!eoc
&& n_left
))
847 idx_t n_buf
= eoc
? split_rest
: n_left
;
848 if (hold_size
- n_hold
< n_buf
)
849 hold
= xpalloc (hold
, &hold_size
, n_buf
- (hold_size
- n_hold
),
851 memcpy (hold
+ n_hold
, sob
, n_buf
);
857 /* Reset for new split. */
867 /* Handle no eol at end of file. */
869 cwrite (n_out
== 0, hold
, n_hold
);
874 /* -n l/[K/]N: Write lines to files of approximately file size / N.
875 The file is partitioned into file size / N sized portions, with the
876 last assigned any excess. If a line _starts_ within a partition
877 it is written completely to the corresponding file. Since lines
878 are not split even if they overlap a partition, the files written
879 can be larger or smaller than the partition size, and even empty
880 if a line is so long as to completely overlap the partition. */
883 lines_chunk_split (intmax_t k
, intmax_t n
, char *buf
, idx_t bufsize
,
884 ssize_t initial_read
, off_t file_size
)
886 affirm (n
&& k
<= n
);
888 intmax_t rem_bytes
= file_size
% n
;
889 off_t chunk_size
= file_size
/ n
;
890 intmax_t chunk_no
= 1;
891 off_t chunk_end
= chunk_size
+ (0 < rem_bytes
);
893 bool new_file_flag
= true;
894 bool chunk_truncated
= false;
896 if (k
> 1 && 0 < file_size
)
898 /* Start reading 1 byte before kth chunk of file. */
899 off_t start
= (k
- 1) * chunk_size
+ MIN (k
- 1, rem_bytes
) - 1;
900 if (start
< initial_read
)
902 memmove (buf
, buf
+ start
, initial_read
- start
);
903 initial_read
-= start
;
907 if (initial_read
< start
908 && lseek (STDIN_FILENO
, start
- initial_read
, SEEK_CUR
) < 0)
909 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
914 chunk_end
= start
+ 1;
917 while (n_written
< file_size
)
919 char *bp
= buf
, *eob
;
921 if (0 <= initial_read
)
923 n_read
= initial_read
;
928 n_read
= read (STDIN_FILENO
, buf
,
929 MIN (bufsize
, file_size
- n_written
));
931 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
935 chunk_truncated
= false;
943 /* Begin looking for '\n' at last byte of chunk. */
944 off_t skip
= MIN (n_read
, MAX (0, chunk_end
- 1 - n_written
));
945 char *bp_out
= memchr (bp
+ skip
, eolchar
, n_read
- skip
);
953 to_write
= bp_out
- bp
;
957 /* We don't use the stdout buffer here since we're writing
958 large chunks from an existing file, so it's more efficient
959 to write out directly. */
960 if (full_write (STDOUT_FILENO
, bp
, to_write
) != to_write
)
964 cwrite (new_file_flag
, bp
, to_write
);
965 n_written
+= to_write
;
968 new_file_flag
= next
;
970 /* A line could have been so long that it skipped
971 entire chunks. So create empty files in that case. */
972 while (next
|| chunk_end
<= n_written
)
974 if (!next
&& bp
== eob
)
976 /* replenish buf, before going to next chunk. */
977 chunk_truncated
= true;
982 chunk_end
+= chunk_size
+ (chunk_no
< rem_bytes
);
984 if (chunk_end
<= n_written
)
987 cwrite (true, nullptr, 0);
998 /* Ensure NUMBER files are created, which truncates
999 any existing files or notifies any consumers on fifos.
1000 FIXME: Should we do this before EXIT_FAILURE? */
1002 while (chunk_no
++ <= n
)
1003 cwrite (true, nullptr, 0);
1006 /* -n K/N: Extract Kth of N chunks.
The chunk is written to standard output. BUF, of size BUFSIZE, is the
I/O buffer; when INITIAL_READ is nonnegative BUF already holds that many
bytes from the start of the input. FILE_SIZE is the total input size
used to compute the chunk boundaries. */
1009 bytes_chunk_extract (intmax_t k
, intmax_t n
, char *buf
, idx_t bufsize
,
1010 ssize_t initial_read
, off_t file_size
)
/* NOTE(review): lines_chunk_split states its precondition with affirm ()
while this function uses assert () -- confirm whether the difference is
intentional. */
1015 assert (0 < k
&& k
<= n
);
/* Each chunk has file_size / n bytes, and each of the first
file_size % n chunks gets one extra byte; START is therefore the total
size of the K - 1 chunks that precede chunk K. */
1017 start
= (k
- 1) * (file_size
/ n
) + MIN (k
- 1, file_size
% n
);
/* END is computed the same way for K chunks, except that the last chunk
always ends at FILE_SIZE. */
1018 end
= k
== n
? file_size
: k
* (file_size
/ n
) + MIN (k
, file_size
% n
);
/* If the chunk begins inside the bytes already held in BUF, shift the
bytes from START onward to the front of BUF and keep them as the
initial data. */
1020 if (start
< initial_read
)
1022 memmove (buf
, buf
+ start
, initial_read
- start
);
1023 initial_read
-= start
;
/* Otherwise skip forward over whatever lies between the bytes already
consumed and START. */
1027 if (initial_read
< start
1028 && lseek (STDIN_FILENO
, start
- initial_read
, SEEK_CUR
) < 0)
1029 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
/* Use the pre-read bytes when INITIAL_READ is nonnegative. */
1036 if (0 <= initial_read
)
1038 n_read
= initial_read
;
/* Otherwise read the next block from standard input. */
1043 n_read
= read (STDIN_FILENO
, buf
, bufsize
);
1045 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
/* Never write past the end of the chunk. */
1049 n_read
= MIN (n_read
, end
- start
);
/* A short write to standard output is fatal unless its errno is
ignorable. */
1050 if (full_write (STDOUT_FILENO
, buf
, n_read
) != n_read
1051 && ! ignorable (errno
))
1052 error (EXIT_FAILURE
, errno
, "%s", quotef ("-"));
1057 typedef struct of_info
1071 /* Rotate file descriptors when we're writing to more output files than we
1072 have available file descriptors.
1073 Return whether we came under file resource pressure.
1074 If so, it's probably best to close each file when finished with it. */
1077 ofile_open (of_t
*files
, idx_t i_check
, idx_t nfiles
)
1079 bool file_limit
= false;
1081 if (files
[i_check
].ofd
<= OFD_NEW
)
1084 idx_t i_reopen
= i_check
? i_check
- 1 : nfiles
- 1;
1086 /* Another process could have opened a file in between the calls to
1087 close and open, so we should keep trying until open succeeds or
1088 we've closed all of our files. */
1091 if (files
[i_check
].ofd
== OFD_NEW
)
1092 fd
= create (files
[i_check
].of_name
);
1093 else /* OFD_APPEND */
1095 /* Attempt to append to previously opened file.
1096 We use O_NONBLOCK to support writing to fifos,
1097 where the other end has closed because of our
1098 previous close. In that case we'll immediately
1099 get an error, rather than waiting indefinitely.
1100 In specialised cases the consumer can keep reading
1101 from the fifo, terminating on conditions in the data
1102 itself, or perhaps never in the case of 'tail -f'.
1103 I.e., for fifos it is valid to attempt this reopen.
1105 We don't handle the filter_command case here, as create()
1106 will exit if there are not enough files in that case.
1107 I.e., we don't support restarting filters, as that would
1108 put too much burden on users specifying --filter commands. */
1109 fd
= open (files
[i_check
].of_name
,
1110 O_WRONLY
| O_BINARY
| O_APPEND
| O_NONBLOCK
);
1116 if (!(errno
== EMFILE
|| errno
== ENFILE
))
1117 error (EXIT_FAILURE
, errno
, "%s", quotef (files
[i_check
].of_name
));
1121 /* Search backwards for an open file to close. */
1122 while (files
[i_reopen
].ofd
< 0)
1124 i_reopen
= i_reopen
? i_reopen
- 1 : nfiles
- 1;
1125 /* No more open files to close, exit with E[NM]FILE. */
1126 if (i_reopen
== i_check
)
1127 error (EXIT_FAILURE
, errno
, "%s",
1128 quotef (files
[i_check
].of_name
));
1131 if (fclose (files
[i_reopen
].ofile
) != 0)
1132 error (EXIT_FAILURE
, errno
, "%s", quotef (files
[i_reopen
].of_name
));
1133 files
[i_reopen
].ofile
= nullptr;
1134 files
[i_reopen
].ofd
= OFD_APPEND
;
1137 files
[i_check
].ofd
= fd
;
1138 FILE *ofile
= fdopen (fd
, "a");
1140 error (EXIT_FAILURE
, errno
, "%s", quotef (files
[i_check
].of_name
));
1141 files
[i_check
].ofile
= ofile
;
1142 files
[i_check
].opid
= filter_pid
;
1149 /* -n r/[K/]N: Divide file into N chunks in round robin fashion.
1150 Use BUF of size BUFSIZE for the buffer, and if allocating storage
1151 put its address into *FILESP to pacify -fsanitize=leak.
1152 When K == 0, we try to keep the files open in parallel.
1153 If we run out of file resources, then we revert
1154 to opening and closing each file for each line. */
1157 lines_rr (intmax_t k
, intmax_t n
, char *buf
, idx_t bufsize
, of_t
**filesp
)
1159 bool wrapped
= false;
1163 of_t
*files
IF_LINT (= nullptr);
1172 files
= *filesp
= xinmalloc (n
, sizeof *files
);
1174 /* Generate output file names. */
1175 for (i_file
= 0; i_file
< n
; i_file
++)
1178 files
[i_file
].of_name
= xstrdup (outfile
);
1179 files
[i_file
].ofd
= OFD_NEW
;
1180 files
[i_file
].ofile
= nullptr;
1181 files
[i_file
].opid
= 0;
1189 char *bp
= buf
, *eob
;
1190 ssize_t n_read
= read (STDIN_FILENO
, buf
, bufsize
);
1192 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
1193 else if (n_read
== 0)
1202 /* Find end of line. */
1203 char *bp_out
= memchr (bp
, eolchar
, eob
- bp
);
1211 to_write
= bp_out
- bp
;
1215 if (line_no
== k
&& unbuffered
)
1217 if (full_write (STDOUT_FILENO
, bp
, to_write
) != to_write
)
1220 else if (line_no
== k
&& fwrite (bp
, to_write
, 1, stdout
) != 1)
1225 line_no
= (line_no
== n
) ? 1 : line_no
+ 1;
1229 /* Secure file descriptor. */
1230 file_limit
|= ofile_open (files
, i_file
, n
);
1233 /* Note writing to fd, rather than flushing the FILE gives
1234 an 8% performance benefit, due to reduced data copying. */
1235 if (full_write (files
[i_file
].ofd
, bp
, to_write
) != to_write
1236 && ! ignorable (errno
))
1237 error (EXIT_FAILURE
, errno
, "%s",
1238 quotef (files
[i_file
].of_name
));
1240 else if (fwrite (bp
, to_write
, 1, files
[i_file
].ofile
) != 1
1241 && ! ignorable (errno
))
1242 error (EXIT_FAILURE
, errno
, "%s",
1243 quotef (files
[i_file
].of_name
));
1245 if (! ignorable (errno
))
1250 if (fclose (files
[i_file
].ofile
) != 0)
1251 error (EXIT_FAILURE
, errno
, "%s",
1252 quotef (files
[i_file
].of_name
));
1253 files
[i_file
].ofile
= nullptr;
1254 files
[i_file
].ofd
= OFD_APPEND
;
1256 if (next
&& ++i_file
== n
)
1259 /* If no filters are accepting input, stop reading. */
1272 /* Ensure all files created, so that any existing files are truncated,
1273 and to signal any waiting fifo consumers.
1274 Also, close any open file descriptors.
1275 FIXME: Should we do this before EXIT_FAILURE? */
1278 idx_t ceiling
= wrapped
? n
: i_file
;
1279 for (i_file
= 0; i_file
< n
; i_file
++)
1281 if (i_file
>= ceiling
&& !elide_empty_files
)
1282 file_limit
|= ofile_open (files
, i_file
, n
);
1283 if (files
[i_file
].ofd
>= 0)
1284 closeout (files
[i_file
].ofile
, files
[i_file
].ofd
,
1285 files
[i_file
].opid
, files
[i_file
].of_name
);
1286 files
[i_file
].ofd
= OFD_APPEND
;
1291 #define FAIL_ONLY_ONE_WAY() \
1294 error (0, 0, _("cannot split in more than one way")); \
1295 usage (EXIT_FAILURE); \
1299 /* Report a string-to-integer conversion failure MSGID with ARG.
MSGID is translated with gettext and ARG is quoted in the message.
The current errno is passed to error (), except that EINVAL is
replaced by 0. The call uses EXIT_FAILURE as its exit status and the
function is declared _Noreturn. */
1301 static _Noreturn
void
1302 strtoint_die (char const *msgid
, char const *arg
)
1304 error (EXIT_FAILURE
, errno
== EINVAL
? 0 : errno
, "%s: %s",
1305 gettext (msgid
), quote (arg
));
1308 /* Use OVERFLOW_OK when it is OK to ignore LONGINT_OVERFLOW errors, since the
1309 extreme value will do the right thing anyway on any practical platform. */
1310 #define OVERFLOW_OK LONGINT_OVERFLOW
1312 /* Parse ARG for number of bytes or lines. The number can be followed
1313 by MULTIPLIERS, and the resulting value must be positive.
1314 If the number cannot be parsed, diagnose with MSGID.
1315 Return the number parsed, or INTMAX_MAX on overflow. */
1318 parse_n_units (char const *arg
, char const *multipliers
, char const *msgid
)
/* ARG is parsed in base 10. Any xstrtoimax status greater than
OVERFLOW_OK, or a parsed value below 1, is handed to strtoint_die. */
1321 if (OVERFLOW_OK
< xstrtoimax (arg
, nullptr, 10, &n
, multipliers
) || n
< 1)
1322 strtoint_die (msgid
, arg
);
1326 /* Parse K/N syntax of chunk options.
ARG is either "N" or "K/N". The results are stored through K_UNITS and
N_UNITS; *K_UNITS is written only when the K/N form is used. */
1329 parse_chunk (intmax_t *k_units
, intmax_t *n_units
, char const *arg
)
/* Parse the leading base-10 integer into *N_UNITS with no multiplier
suffixes allowed; ARGEND is left where parsing stopped. */
1332 strtol_error e
= xstrtoimax (arg
, &argend
, 10, n_units
, "");
/* Parsing stopped at a '/', so the number just read is really K. */
1333 if (e
== LONGINT_INVALID_SUFFIX_CHAR
&& *argend
== '/')
1335 *k_units
= *n_units
;
/* N is whatever follows the slash. */
1336 *n_units
= parse_n_units (argend
+ 1, "",
1337 N_("invalid number of chunks"));
/* Require 0 < K <= N; the diagnostic quotes only the K part of ARG. */
1338 if (! (0 < *k_units
&& *k_units
<= *n_units
))
1339 error (EXIT_FAILURE
, 0, "%s: %s", _("invalid chunk number"),
1340 quote_mem (arg
, argend
- arg
));
/* Plain "N" form: the parse status must be no worse than OVERFLOW_OK
and N must be positive. */
1342 else if (! (e
<= OVERFLOW_OK
&& 0 < *n_units
))
1343 strtoint_die (N_("invalid number of chunks"), arg
);
/* Program entry point.  As far as this listing shows, it:
   - sets the program name, locale and message domain, and registers
     close_stdout with atexit;
   - parses options with getopt_long, recording the chosen mode in
     split_type and its parameters in k_units / n_units, plus the
     suffix, separator, filter and block-size settings;
   - takes the input-file and output-prefix operands from argv;
   - reopens standard input on the input file unless it is "-", stats
     it, and allocates a buffer of in_blk_size + 1 bytes;
   - calls the splitting routine that matches split_type;
   - closes standard input and the current output, then calls
     main_exit (EXIT_SUCCESS).
   NOTE(review): the option loop, the switch statements and their case
   labels are not visible in this listing -- confirm against the full
   file before relying on the exact control flow.  */
1348 main (int argc
, char **argv
)
1350 enum Split_type split_type
= type_undef
;
1351 idx_t in_blk_size
= 0; /* optimal block size of input file device */
1352 idx_t page_size
= getpagesize ();
1353 intmax_t k_units
= 0;
1354 intmax_t n_units
= 0;
1356 static char const multipliers
[] = "bEGKkMmPQRTYZ0";
1358 int digits_optind
= 0;
1359 off_t file_size
= OFF_T_MAX
;
1361 initialize_main (&argc
, &argv
);
1362 set_program_name (argv
[0]);
1363 setlocale (LC_ALL
, "");
1364 bindtextdomain (PACKAGE
, LOCALEDIR
);
1365 textdomain (PACKAGE
);
1367 atexit (close_stdout
);
1369 /* Parse command line options. */
/* Defaults used when no operands are given: input "-" (standard input
   is then used as is, see the fd_reopen test below) and output prefix
   "x".  */
1371 infile
= bad_cast ("-");
1372 outbase
= bad_cast ("x");
1376 /* This is the argv-index of the option we will read next. */
1377 int this_optind
= optind
? optind
: 1;
1379 c
= getopt_long (argc
, argv
, "0123456789C:a:b:del:n:t:ux",
1387 suffix_length
= xdectoimax (optarg
, 0, IDX_MAX
,
1388 "", _("invalid suffix length"), 0);
1391 case ADDITIONAL_SUFFIX_OPTION
:
1393 int suffix_len
= strlen (optarg
);
1394 if (last_component (optarg
) != optarg
1395 || (suffix_len
&& ISSLASH (optarg
[suffix_len
- 1])))
1398 _("invalid suffix %s, contains directory separator"),
1400 usage (EXIT_FAILURE
);
1403 additional_suffix
= optarg
;
/* Byte-count mode.  Like the other mode options below, it invokes
   FAIL_ONLY_ONE_WAY when split_type has already been set.  */
1407 if (split_type
!= type_undef
)
1408 FAIL_ONLY_ONE_WAY ();
1409 split_type
= type_bytes
;
1410 n_units
= parse_n_units (optarg
, multipliers
,
1411 N_("invalid number of bytes"));
1415 if (split_type
!= type_undef
)
1416 FAIL_ONLY_ONE_WAY ();
1417 split_type
= type_lines
;
1418 n_units
= parse_n_units (optarg
, "", N_("invalid number of lines"));
1422 if (split_type
!= type_undef
)
1423 FAIL_ONLY_ONE_WAY ();
1424 split_type
= type_byteslines
;
1425 n_units
= parse_n_units (optarg
, multipliers
,
1426 N_("invalid number of lines"));
1430 if (split_type
!= type_undef
)
1431 FAIL_ONLY_ONE_WAY ();
1432 /* skip any whitespace */
1433 while (isspace (to_uchar (*optarg
)))
1435 if (STRNCMP_LIT (optarg
, "r/") == 0)
1437 split_type
= type_rr
;
1440 else if (STRNCMP_LIT (optarg
, "l/") == 0)
1442 split_type
= type_chunk_lines
;
1446 split_type
= type_chunk_bytes
;
1447 parse_chunk (&k_units
, &n_units
, optarg
);
1456 char neweol
= optarg
[0];
1458 error (EXIT_FAILURE
, 0, _("empty record separator"));
1461 if (STREQ (optarg
, "\\0"))
1465 /* Provoke with 'split -txx'. Complain about
1466 "multi-character tab" instead of "multibyte tab", so
1467 that the diagnostic's wording does not need to be
1468 changed once multibyte characters are supported. */
1469 error (EXIT_FAILURE
, 0, _("multi-character separator %s"),
1473 /* Make it explicit we don't support multiple separators. */
1474 if (0 <= eolchar
&& neweol
!= eolchar
)
1476 error (EXIT_FAILURE
, 0,
1477 _("multiple separator characters specified"));
1494 if (split_type
== type_undef
)
1496 split_type
= type_digits
;
1499 if (split_type
!= type_undef
&& split_type
!= type_digits
)
1500 FAIL_ONLY_ONE_WAY ();
1501 if (digits_optind
!= 0 && digits_optind
!= this_optind
)
1502 n_units
= 0; /* More than one number given; ignore other. */
1503 digits_optind
= this_optind
;
/* Append the digit character C to n_units using checked arithmetic;
   if either step overflows, n_units is pinned at INTMAX_MAX.  */
1504 if (ckd_mul (&n_units
, n_units
, 10)
1505 || ckd_add (&n_units
, n_units
, c
- '0'))
1506 n_units
= INTMAX_MAX
;
1512 suffix_alphabet
= "0123456789";
1514 suffix_alphabet
= "0123456789abcdef";
1517 if (strlen (optarg
) != strspn (optarg
, suffix_alphabet
))
1521 _("%s: invalid start value for numerical suffix") :
1522 _("%s: invalid start value for hexadecimal suffix"),
1524 usage (EXIT_FAILURE
);
1528 /* Skip any leading zero. */
1529 while (*optarg
== '0' && *(optarg
+ 1) != '\0')
1531 numeric_suffix_start
= optarg
;
1537 elide_empty_files
= true;
1541 filter_command
= optarg
;
1544 case IO_BLKSIZE_OPTION
:
1545 in_blk_size
= xdectoumax (optarg
, 1,
1546 MIN (SYS_BUFSIZE_MAX
,
1547 MIN (IDX_MAX
, SIZE_MAX
) - 1),
1548 multipliers
, _("invalid IO block size"), 0);
1551 case VERBOSE_OPTION
:
1555 case_GETOPT_HELP_CHAR
;
1557 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
1560 usage (EXIT_FAILURE
);
/* k_units starts at 0 and is set only by parse_chunk for the K/N form,
   so a nonzero value means a single chunk was requested; that cannot
   be combined with a filter command.  */
1564 if (k_units
!= 0 && filter_command
)
1566 error (0, 0, _("--filter does not process a chunk extracted to stdout"));
1567 usage (EXIT_FAILURE
);
1570 /* Handle default case. */
1571 if (split_type
== type_undef
)
1573 split_type
= type_lines
;
1579 error (0, 0, _("invalid number of lines: %s"), quote ("0"));
1580 usage (EXIT_FAILURE
);
1586 set_suffix_length (n_units
, split_type
);
1588 /* Get out the filename arguments. */
1591 infile
= argv
[optind
++];
1594 outbase
= argv
[optind
++];
1598 error (0, 0, _("extra operand %s"), quote (argv
[optind
]));
1599 usage (EXIT_FAILURE
);
1602 /* Check that the suffix length is large enough for the numerical
1603 suffix start value. */
1604 if (numeric_suffix_start
&& strlen (numeric_suffix_start
) > suffix_length
)
1606 error (0, 0, _("numerical suffix start value is too large "
1607 "for the suffix length"));
1608 usage (EXIT_FAILURE
);
1611 /* Open the input file. */
1612 if (! STREQ (infile
, "-")
1613 && fd_reopen (STDIN_FILENO
, infile
, O_RDONLY
, 0) < 0)
1614 error (EXIT_FAILURE
, errno
, _("cannot open %s for reading"),
1617 /* Binary I/O is safer when byte counts are used. */
1618 xset_binary_mode (STDIN_FILENO
, O_BINARY
);
1620 /* Advise the kernel of our access pattern. */
1621 fdadvise (STDIN_FILENO
, 0, 0, FADVISE_SEQUENTIAL
);
1623 /* Get the optimal block size of input device and make a buffer. */
1625 if (fstat (STDIN_FILENO
, &in_stat_buf
) != 0)
1626 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
/* in_blk_size is still 0 when no block size was given on the command
   line (the option enforces a minimum of 1); in that case derive it
   from the stat data and cap it at SYS_BUFSIZE_MAX.  */
1628 if (in_blk_size
== 0)
1630 in_blk_size
= io_blksize (in_stat_buf
);
1631 if (SYS_BUFSIZE_MAX
< in_blk_size
)
1632 in_blk_size
= SYS_BUFSIZE_MAX
;
1635 char *buf
= xalignalloc (page_size
, in_blk_size
+ 1);
1636 ssize_t initial_read
= -1;
/* The byte- and line-chunk modes query the input size first and fail
   if it cannot be determined; initial_read then becomes
   MIN (file_size, in_blk_size) instead of -1.  */
1638 if (split_type
== type_chunk_bytes
|| split_type
== type_chunk_lines
)
1640 file_size
= input_file_size (STDIN_FILENO
, &in_stat_buf
,
1643 error (EXIT_FAILURE
, errno
, _("%s: cannot determine file size"),
1645 initial_read
= MIN (file_size
, in_blk_size
);
1648 /* When filtering, closure of one pipe must not terminate the process,
1649 as there may still be other streams expecting input from us. */
1651 default_SIGPIPE
= signal (SIGPIPE
, SIG_IGN
) == SIG_DFL
;
1657 lines_split (n_units
, buf
, in_blk_size
);
1661 bytes_split (n_units
, 0, buf
, in_blk_size
, -1, 0);
1664 case type_byteslines
:
1665 line_bytes_split (n_units
, buf
, in_blk_size
);
1668 case type_chunk_bytes
:
1670 bytes_split (file_size
/ n_units
, file_size
% n_units
,
1671 buf
, in_blk_size
, initial_read
, n_units
);
1673 bytes_chunk_extract (k_units
, n_units
, buf
, in_blk_size
, initial_read
,
1677 case type_chunk_lines
:
1678 lines_chunk_split (k_units
, n_units
, buf
, in_blk_size
, initial_read
,
1683 /* Note, this is like 'sed -n ${k}~${n}p' when k > 0,
1684 but the functionality is provided for symmetry. */
1687 lines_rr (k_units
, n_units
, buf
, in_blk_size
, &files
);
/* A failure to close standard input is fatal and reported against the
   input file name; the current output is closed afterwards.  */
1695 if (close (STDIN_FILENO
) != 0)
1696 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
1697 closeout (nullptr, output_desc
, filter_pid
, outfile
);
1699 main_exit (EXIT_SUCCESS
);