shuf: tiny simplification
[coreutils.git] / src / sort.c
blob3357733c109262222d80ed598a70a11e57cf3a7b
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <ctype.h>
26 #include <getopt.h>
27 #include <pthread.h>
28 #include <sys/resource.h>
29 #include <sys/types.h>
30 #include <sys/wait.h>
31 #include <signal.h>
32 #include "system.h"
33 #include "argmatch.h"
34 #include "assure.h"
35 #include "fadvise.h"
36 #include "filevercmp.h"
37 #include "flexmember.h"
38 #include "hard-locale.h"
39 #include "hash.h"
40 #include "heap.h"
41 #include "ignore-value.h"
42 #include "mbswidth.h"
43 #include "nproc.h"
44 #include "physmem.h"
45 #include "posixver.h"
46 #include "quote.h"
47 #include "randread.h"
48 #include "readtokens0.h"
49 #include "stdlib--.h"
50 #include "strnumcmp.h"
51 #include "xmemcoll.h"
52 #include "xnanosleep.h"
53 #include "xstrtol.h"
54 #include "xstrtol-error.h"
56 #ifndef RLIMIT_DATA
57 struct rlimit { size_t rlim_cur; };
58 # define getrlimit(Resource, Rlp) (-1)
59 #endif
61 /* The official name of this program (e.g., no 'g' prefix). */
62 #define PROGRAM_NAME "sort"
64 #define AUTHORS \
65 proper_name ("Mike Haertel"), \
66 proper_name ("Paul Eggert")
68 #if HAVE_LANGINFO_CODESET
69 # include <langinfo.h>
70 #endif
72 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
73 present. */
74 #ifndef SA_NOCLDSTOP
75 # define SA_NOCLDSTOP 0
76 /* No sigprocmask. Always 'return' zero. */
77 # define sigprocmask(How, Set, Oset) (0)
78 # define sigset_t int
79 # if ! HAVE_SIGINTERRUPT
80 # define siginterrupt(sig, flag) /* empty */
81 # endif
82 #endif
84 #if !defined OPEN_MAX && defined NR_OPEN
85 # define OPEN_MAX NR_OPEN
86 #endif
87 #if !defined OPEN_MAX
88 # define OPEN_MAX 20
89 #endif
91 #define UCHAR_LIM (UCHAR_MAX + 1)
93 #ifndef DEFAULT_TMPDIR
94 # define DEFAULT_TMPDIR "/tmp"
95 #endif
97 /* Maximum number of lines to merge every time a NODE is taken from
98 the merge queue. Node is at LEVEL in the binary merge tree,
99 and is responsible for merging TOTAL lines. */
100 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
102 /* Heuristic value for the number of lines for which it is worth creating
103 a subthread, during an internal merge sort. I.e., it is a small number
104 of "average" lines for which sorting via two threads is faster than
105 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
106 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
107 lines of gensort -a output is sorted slightly faster with --parallel=2
108 than with --parallel=1. By contrast, using --parallel=1 is about 10%
109 faster than using --parallel=2 with a 64K-line input. */
110 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
111 static_assert (4 <= SUBTHREAD_LINES_HEURISTIC);
113 /* The number of threads after which there are
114 diminishing performance gains. */
115 enum { DEFAULT_MAX_THREADS = 8 };
117 /* Exit statuses. */
118 enum
120 /* POSIX says to exit with status 1 if invoked with -c and the
121 input is not properly sorted. */
122 SORT_OUT_OF_ORDER = 1,
124 /* POSIX says any other irregular exit must exit with a status
125 code greater than 1. */
126 SORT_FAILURE = 2
129 enum
131 /* The number of times we should try to fork a compression process
132 (we retry if the fork call fails). We don't _need_ to compress
133 temp files, this is just to reduce file system access, so this number
134 can be small. Each retry doubles in duration. */
135 MAX_FORK_TRIES_COMPRESS = 4,
137 /* The number of times we should try to fork a decompression process.
138 If we can't fork a decompression process, we can't sort, so this
139 number should be big. Each retry doubles in duration. */
140 MAX_FORK_TRIES_DECOMPRESS = 9
143 enum
145 /* Level of the end-of-merge node, one level above the root. */
146 MERGE_END = 0,
148 /* Level of the root node in merge tree. */
149 MERGE_ROOT = 1
152 /* The representation of the decimal point in the current locale. */
153 static char decimal_point;
155 /* Thousands separator; if outside char range, there is no separator. */
156 static int thousands_sep;
157 /* We currently ignore multi-byte grouping chars. */
158 static bool thousands_sep_ignored;
160 /* Nonzero if the corresponding locales are hard. */
161 static bool hard_LC_COLLATE;
162 #if HAVE_NL_LANGINFO
163 static bool hard_LC_TIME;
164 #endif
166 #define NONZERO(x) ((x) != 0)
168 /* The kind of blanks for '-b' to skip in various options. */
169 enum blanktype { bl_start, bl_end, bl_both };
171 /* The character marking end of line. Default to \n. */
172 static char eolchar = '\n';
174 /* Lines are held in memory as counted strings. */
175 struct line
177 char *text; /* Text of the line. */
178 size_t length; /* Length including final newline. */
179 char *keybeg; /* Start of first key. */
180 char *keylim; /* Limit of first key. */
183 /* Input buffers. */
184 struct buffer
186 char *buf; /* Dynamically allocated buffer,
187 partitioned into 3 regions:
188 - input data;
189 - unused area;
190 - an array of lines, in reverse order. */
191 size_t used; /* Number of bytes used for input data. */
192 size_t nlines; /* Number of lines in the line array. */
193 size_t alloc; /* Number of bytes allocated. */
194 size_t left; /* Number of bytes left from previous reads. */
195 size_t line_bytes; /* Number of bytes to reserve for each line. */
196 bool eof; /* An EOF has been read. */
199 /* Sort key. */
200 struct keyfield
202 size_t sword; /* Zero-origin 'word' to start at. */
203 size_t schar; /* Additional characters to skip. */
204 size_t eword; /* Zero-origin last 'word' of key. */
205 size_t echar; /* Additional characters in field. */
206 bool const *ignore; /* Boolean array of characters to ignore. */
207 char const *translate; /* Translation applied to characters. */
208 bool skipsblanks; /* Skip leading blanks when finding start. */
209 bool skipeblanks; /* Skip leading blanks when finding end. */
210 bool numeric; /* Flag for numeric comparison. Handle
211 strings of digits with optional decimal
212 point, but no exponential notation. */
213 bool random; /* Sort by random hash of key. */
214 bool general_numeric; /* Flag for general, numeric comparison.
215 Handle numbers in exponential notation. */
216 bool human_numeric; /* Flag for sorting by human readable
217 units with either SI or IEC prefixes. */
218 bool month; /* Flag for comparison by month name. */
219 bool reverse; /* Reverse the sense of comparison. */
220 bool version; /* sort by version number */
221 bool traditional_used; /* Traditional key option format is used. */
222 struct keyfield *next; /* Next keyfield to try. */
225 struct month
227 char const *name;
228 int val;
231 /* Binary merge tree node. */
232 struct merge_node
234 struct line *lo; /* Lines to merge from LO child node. */
235 struct line *hi; /* Lines to merge from HI child node. */
236 struct line *end_lo; /* End of available lines from LO. */
237 struct line *end_hi; /* End of available lines from HI. */
238 struct line **dest; /* Pointer to destination of merge. */
239 size_t nlo; /* Total Lines remaining from LO. */
240 size_t nhi; /* Total lines remaining from HI. */
241 struct merge_node *parent; /* Parent node. */
242 struct merge_node *lo_child; /* LO child node. */
243 struct merge_node *hi_child; /* HI child node. */
244 unsigned int level; /* Level in merge tree. */
245 bool queued; /* Node is already in heap. */
246 pthread_mutex_t lock; /* Lock for node operations. */
249 /* Priority queue of merge nodes. */
250 struct merge_node_queue
252 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
253 pthread_mutex_t mutex; /* Lock for queue operations. */
254 pthread_cond_t cond; /* Conditional wait for empty queue to populate
255 when popping. */
258 /* Used to implement --unique (-u). */
259 static struct line saved_line;
261 /* FIXME: None of these tables work with multibyte character sets.
262 Also, there are many other bugs when handling multibyte characters.
263 One way to fix this is to rewrite 'sort' to use wide characters
264 internally, but doing this with good performance is a bit
265 tricky. */
267 /* Table of blanks. */
268 static bool blanks[UCHAR_LIM];
270 /* Table of non-printing characters. */
271 static bool nonprinting[UCHAR_LIM];
273 /* Table of non-dictionary characters (not letters, digits, or blanks). */
274 static bool nondictionary[UCHAR_LIM];
276 /* Translation table folding lower case to upper. */
277 static char fold_toupper[UCHAR_LIM];
279 #define MONTHS_PER_YEAR 12
281 /* Table mapping month names to integers.
282 Alphabetic order allows binary search. */
283 static struct month monthtab[] =
285 {"APR", 4},
286 {"AUG", 8},
287 {"DEC", 12},
288 {"FEB", 2},
289 {"JAN", 1},
290 {"JUL", 7},
291 {"JUN", 6},
292 {"MAR", 3},
293 {"MAY", 5},
294 {"NOV", 11},
295 {"OCT", 10},
296 {"SEP", 9}
299 /* During the merge phase, the number of files to merge at once. */
300 #define NMERGE_DEFAULT 16
302 /* Minimum size for a merge or check buffer. */
303 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
305 /* Minimum sort size; the code might not work with smaller sizes. */
306 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
308 /* The number of bytes needed for a merge or check buffer, which can
309 function relatively efficiently even if it holds only one line. If
310 a longer line is seen, this value is increased. */
311 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
313 /* The approximate maximum number of bytes of main memory to use, as
314 specified by the user. Zero if the user has not specified a size. */
315 static size_t sort_size;
317 /* The initial allocation factor for non-regular files.
318 This is used, e.g., when reading from a pipe.
319 Don't make it too big, since it is multiplied by ~130 to
320 obtain the size of the actual buffer sort will allocate.
321 Also, there may be 8 threads all doing this at the same time. */
322 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
324 /* Array of directory names in which any temporary files are to be created. */
325 static char const **temp_dirs;
327 /* Number of temporary directory names used. */
328 static size_t temp_dir_count;
330 /* Number of allocated slots in temp_dirs. */
331 static size_t temp_dir_alloc;
333 /* Flag to reverse the order of all comparisons. */
334 static bool reverse;
336 /* Flag for stable sort. This turns off the last ditch bytewise
337 comparison of lines, and instead leaves lines in the same order
338 they were read if all keys compare equal. */
339 static bool stable;
341 /* An int value outside char range. */
342 enum { NON_CHAR = CHAR_MAX + 1 };
344 /* If TAB has this value, blanks separate fields. */
345 enum { TAB_DEFAULT = CHAR_MAX + 1 };
347 /* Tab character separating fields. If TAB_DEFAULT, then fields are
348 separated by the empty string between a non-blank character and a blank
349 character. */
350 static int tab = TAB_DEFAULT;
352 /* Flag to remove consecutive duplicate lines from the output.
353 Only the last of a sequence of equal lines will be output. */
354 static bool unique;
356 /* Nonzero if any of the input files are the standard input. */
357 static bool have_read_stdin;
359 /* List of key field comparisons to be tried. */
360 static struct keyfield *keylist;
362 /* Program used to (de)compress temp files. Must accept -d. */
363 static char const *compress_program;
365 /* Annotate the output with extra info to aid the user. */
366 static bool debug;
368 /* Maximum number of files to merge in one go. If more than this
369 number are present, temp files will be used. */
370 static unsigned int nmerge = NMERGE_DEFAULT;
372 /* Output an error to stderr and exit using async-signal-safe routines.
373 This can be used safely from signal handlers,
374 and between fork and exec of multithreaded processes. */
376 static _Noreturn void
377 async_safe_die (int errnum, char const *errstr)
379 ignore_value (write (STDERR_FILENO, errstr, strlen (errstr)));
381 /* Even if defined HAVE_STRERROR_R, we can't use it,
382 as it may return a translated string etc. and even if not
383 may call malloc which is unsafe. We might improve this
384 by testing for sys_errlist and using that if available.
385 For now just report the error number. */
386 if (errnum)
388 char errbuf[INT_BUFSIZE_BOUND (errnum)];
389 char *p = inttostr (errnum, errbuf);
390 ignore_value (write (STDERR_FILENO, ": errno ", 8));
391 ignore_value (write (STDERR_FILENO, p, strlen (p)));
394 ignore_value (write (STDERR_FILENO, "\n", 1));
396 _exit (SORT_FAILURE);
399 /* Report MESSAGE for FILE, then clean up and exit.
400 If FILE is null, it represents standard output. */
402 static void
403 sort_die (char const *message, char const *file)
405 error (SORT_FAILURE, errno, "%s: %s", message,
406 quotef (file ? file : _("standard output")));
409 void
410 usage (int status)
412 if (status != EXIT_SUCCESS)
413 emit_try_help ();
414 else
416 printf (_("\
417 Usage: %s [OPTION]... [FILE]...\n\
418 or: %s [OPTION]... --files0-from=F\n\
420 program_name, program_name);
421 fputs (_("\
422 Write sorted concatenation of all FILE(s) to standard output.\n\
423 "), stdout);
425 emit_stdin_note ();
426 emit_mandatory_arg_note ();
428 fputs (_("\
429 Ordering options:\n\
431 "), stdout);
432 fputs (_("\
433 -b, --ignore-leading-blanks ignore leading blanks\n\
434 -d, --dictionary-order consider only blanks and alphanumeric characters\
436 -f, --ignore-case fold lower case to upper case characters\n\
437 "), stdout);
438 fputs (_("\
439 -g, --general-numeric-sort compare according to general numerical value\n\
440 -i, --ignore-nonprinting consider only printable characters\n\
441 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
442 "), stdout);
443 fputs (_("\
444 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
445 "), stdout);
446 fputs (_("\
447 -n, --numeric-sort compare according to string numerical value;\n\
448 see manual for which strings are supported\n\
449 -R, --random-sort shuffle, but group identical keys. See shuf(1)\n\
450 --random-source=FILE get random bytes from FILE\n\
451 -r, --reverse reverse the result of comparisons\n\
452 "), stdout);
453 fputs (_("\
454 --sort=WORD sort according to WORD:\n\
455 general-numeric -g, human-numeric -h, month -M,\
457 numeric -n, random -R, version -V\n\
458 -V, --version-sort natural sort of (version) numbers within text\n\
460 "), stdout);
461 fputs (_("\
462 Other options:\n\
464 "), stdout);
465 fputs (_("\
466 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
467 for more use temp files\n\
468 "), stdout);
469 fputs (_("\
470 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
471 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
473 --compress-program=PROG compress temporaries with PROG;\n\
474 decompress them with PROG -d\n\
475 "), stdout);
476 fputs (_("\
477 --debug annotate the part of the line used to sort,\n\
478 and warn about questionable usage to stderr\n\
479 --files0-from=F read input from the files specified by\n\
480 NUL-terminated names in file F;\n\
481 If F is - then read names from standard input\n\
482 "), stdout);
483 fputs (_("\
484 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
485 -m, --merge merge already sorted files; do not sort\n\
486 "), stdout);
487 fputs (_("\
488 -o, --output=FILE write result to FILE instead of standard output\n\
489 -s, --stable stabilize sort by disabling last-resort comparison\
491 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
492 "), stdout);
493 printf (_("\
494 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
495 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
496 multiple options specify multiple directories\n\
497 --parallel=N change the number of sorts run concurrently to N\n\
498 -u, --unique with -c, check for strict ordering;\n\
499 without -c, output only the first of an equal run\
501 "), DEFAULT_TMPDIR);
502 fputs (_("\
503 -z, --zero-terminated line delimiter is NUL, not newline\n\
504 "), stdout);
505 fputs (HELP_OPTION_DESCRIPTION, stdout);
506 fputs (VERSION_OPTION_DESCRIPTION, stdout);
507 fputs (_("\
509 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
510 field number and C a character position in the field; both are origin 1, and\n\
511 the stop position defaults to the line's end. If neither -t nor -b is in\n\
512 effect, characters in a field are counted from the beginning of the preceding\n\
513 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
515 which override global ordering options for that key. If no key is given, use\n\
516 the entire line as the key. Use --debug to diagnose incorrect key usage.\n\
518 SIZE may be followed by the following multiplicative suffixes:\n\
519 "), stdout);
520 fputs (_("\
521 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y, R, Q.\
522 \n\n\
523 *** WARNING ***\n\
524 The locale specified by the environment affects sort order.\n\
525 Set LC_ALL=C to get the traditional sort order that uses\n\
526 native byte values.\n\
527 "), stdout );
528 emit_ancillary_info (PROGRAM_NAME);
531 exit (status);
534 /* For long options that have no equivalent short option, use a
535 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
536 enum
538 CHECK_OPTION = CHAR_MAX + 1,
539 COMPRESS_PROGRAM_OPTION,
540 DEBUG_PROGRAM_OPTION,
541 FILES0_FROM_OPTION,
542 NMERGE_OPTION,
543 RANDOM_SOURCE_OPTION,
544 SORT_OPTION,
545 PARALLEL_OPTION
548 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
550 static struct option const long_options[] =
552 {"ignore-leading-blanks", no_argument, nullptr, 'b'},
553 {"check", optional_argument, nullptr, CHECK_OPTION},
554 {"compress-program", required_argument, nullptr, COMPRESS_PROGRAM_OPTION},
555 {"debug", no_argument, nullptr, DEBUG_PROGRAM_OPTION},
556 {"dictionary-order", no_argument, nullptr, 'd'},
557 {"ignore-case", no_argument, nullptr, 'f'},
558 {"files0-from", required_argument, nullptr, FILES0_FROM_OPTION},
559 {"general-numeric-sort", no_argument, nullptr, 'g'},
560 {"ignore-nonprinting", no_argument, nullptr, 'i'},
561 {"key", required_argument, nullptr, 'k'},
562 {"merge", no_argument, nullptr, 'm'},
563 {"month-sort", no_argument, nullptr, 'M'},
564 {"numeric-sort", no_argument, nullptr, 'n'},
565 {"human-numeric-sort", no_argument, nullptr, 'h'},
566 {"version-sort", no_argument, nullptr, 'V'},
567 {"random-sort", no_argument, nullptr, 'R'},
568 {"random-source", required_argument, nullptr, RANDOM_SOURCE_OPTION},
569 {"sort", required_argument, nullptr, SORT_OPTION},
570 {"output", required_argument, nullptr, 'o'},
571 {"reverse", no_argument, nullptr, 'r'},
572 {"stable", no_argument, nullptr, 's'},
573 {"batch-size", required_argument, nullptr, NMERGE_OPTION},
574 {"buffer-size", required_argument, nullptr, 'S'},
575 {"field-separator", required_argument, nullptr, 't'},
576 {"temporary-directory", required_argument, nullptr, 'T'},
577 {"unique", no_argument, nullptr, 'u'},
578 {"zero-terminated", no_argument, nullptr, 'z'},
579 {"parallel", required_argument, nullptr, PARALLEL_OPTION},
580 {GETOPT_HELP_OPTION_DECL},
581 {GETOPT_VERSION_OPTION_DECL},
582 {nullptr, 0, nullptr, 0},
585 #define CHECK_TABLE \
586 _ct_("quiet", 'C') \
587 _ct_("silent", 'C') \
588 _ct_("diagnose-first", 'c')
590 static char const *const check_args[] =
592 #define _ct_(_s, _c) _s,
593 CHECK_TABLE nullptr
594 #undef _ct_
596 static char const check_types[] =
598 #define _ct_(_s, _c) _c,
599 CHECK_TABLE
600 #undef _ct_
603 #define SORT_TABLE \
604 _st_("general-numeric", 'g') \
605 _st_("human-numeric", 'h') \
606 _st_("month", 'M') \
607 _st_("numeric", 'n') \
608 _st_("random", 'R') \
609 _st_("version", 'V')
611 static char const *const sort_args[] =
613 #define _st_(_s, _c) _s,
614 SORT_TABLE nullptr
615 #undef _st_
617 static char const sort_types[] =
619 #define _st_(_s, _c) _c,
620 SORT_TABLE
621 #undef _st_
624 /* The set of signals that are caught. */
625 static sigset_t caught_signals;
627 /* Critical section status. */
628 struct cs_status
630 bool valid;
631 sigset_t sigs;
634 /* Enter a critical section. */
635 static void
636 cs_enter (struct cs_status *status)
638 int ret = pthread_sigmask (SIG_BLOCK, &caught_signals, &status->sigs);
639 status->valid = ret == 0;
642 /* Leave a critical section. */
643 static void
644 cs_leave (struct cs_status const *status)
646 if (status->valid)
648 /* Ignore failure when restoring the signal mask. */
649 pthread_sigmask (SIG_SETMASK, &status->sigs, nullptr);
653 /* Possible states for a temp file. If compressed, the file's status
654 is unreaped or reaped, depending on whether 'sort' has waited for
655 the subprocess to finish. */
656 enum { UNCOMPRESSED, UNREAPED, REAPED };
658 /* The list of temporary files. */
659 struct tempnode
661 struct tempnode *volatile next;
662 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
663 char state;
664 char name[FLEXIBLE_ARRAY_MEMBER];
666 static struct tempnode *volatile temphead;
667 static struct tempnode *volatile *temptail = &temphead;
669 /* A file to be sorted. */
670 struct sortfile
672 /* The file's name. */
673 char const *name;
675 /* Non-null if this is a temporary file, in which case NAME == TEMP->name. */
676 struct tempnode *temp;
679 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
680 static Hash_table *proctab;
682 enum { INIT_PROCTAB_SIZE = 47 };
684 static size_t
685 proctab_hasher (void const *entry, size_t tabsize)
687 struct tempnode const *node = entry;
688 return node->pid % tabsize;
691 static bool
692 proctab_comparator (void const *e1, void const *e2)
694 struct tempnode const *n1 = e1;
695 struct tempnode const *n2 = e2;
696 return n1->pid == n2->pid;
699 /* The number of unreaped child processes. */
700 static pid_t nprocs;
702 static bool delete_proc (pid_t);
704 /* If PID is positive, wait for the child process with that PID to
705 exit, and assume that PID has already been removed from the process
706 table. If PID is 0 or -1, clean up some child that has exited (by
707 waiting for it, and removing it from the proc table) and return the
708 child's process ID. However, if PID is 0 and no children have
709 exited, return 0 without waiting. */
711 static pid_t
712 reap (pid_t pid)
714 int status;
715 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
717 if (cpid < 0)
718 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
719 quoteaf (compress_program));
720 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
722 if (! WIFEXITED (status) || WEXITSTATUS (status))
723 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
724 quoteaf (compress_program));
725 --nprocs;
728 return cpid;
731 /* TEMP represents a new process; add it to the process table. Create
732 the process table the first time it's called. */
734 static void
735 register_proc (struct tempnode *temp)
737 if (! proctab)
739 proctab = hash_initialize (INIT_PROCTAB_SIZE, nullptr,
740 proctab_hasher,
741 proctab_comparator,
742 nullptr);
743 if (! proctab)
744 xalloc_die ();
747 temp->state = UNREAPED;
749 if (! hash_insert (proctab, temp))
750 xalloc_die ();
753 /* If PID is in the process table, remove it and return true.
754 Otherwise, return false. */
756 static bool
757 delete_proc (pid_t pid)
759 struct tempnode test;
761 test.pid = pid;
762 struct tempnode *node = hash_remove (proctab, &test);
763 if (! node)
764 return false;
765 node->state = REAPED;
766 return true;
769 /* Remove PID from the process table, and wait for it to exit if it
770 hasn't already. */
772 static void
773 wait_proc (pid_t pid)
775 if (delete_proc (pid))
776 reap (pid);
779 /* Reap any exited children. Do not block; reap only those that have
780 already exited. */
782 static void
783 reap_exited (void)
785 while (0 < nprocs && reap (0))
786 continue;
789 /* Reap at least one exited child, waiting if necessary. */
791 static void
792 reap_some (void)
794 reap (-1);
795 reap_exited ();
798 /* Reap all children, waiting if necessary. */
800 static void
801 reap_all (void)
803 while (0 < nprocs)
804 reap (-1);
807 /* Clean up any remaining temporary files. */
809 static void
810 cleanup (void)
812 struct tempnode const *node;
814 for (node = temphead; node; node = node->next)
815 unlink (node->name);
816 temphead = nullptr;
819 /* Cleanup actions to take when exiting. */
821 static void
822 exit_cleanup (void)
824 if (temphead)
826 /* Clean up any remaining temporary files in a critical section so
827 that a signal handler does not try to clean them too. */
828 struct cs_status cs;
829 cs_enter (&cs);
830 cleanup ();
831 cs_leave (&cs);
834 close_stdout ();
837 /* Create a new temporary file, returning its newly allocated tempnode.
838 Store into *PFD the file descriptor open for writing.
839 If the creation fails, return nullptr and store -1 into *PFD if the
840 failure is due to file descriptor exhaustion and
841 SURVIVE_FD_EXHAUSTION; otherwise, die. */
843 static struct tempnode *
844 create_temp_file (int *pfd, bool survive_fd_exhaustion)
846 static char const slashbase[] = "/sortXXXXXX";
847 static size_t temp_dir_index;
848 int fd;
849 int saved_errno;
850 char const *temp_dir = temp_dirs[temp_dir_index];
851 size_t len = strlen (temp_dir);
852 struct tempnode *node =
853 xmalloc (FLEXSIZEOF (struct tempnode, name, len + sizeof slashbase));
854 char *file = node->name;
855 struct cs_status cs;
857 memcpy (file, temp_dir, len);
858 memcpy (file + len, slashbase, sizeof slashbase);
859 node->next = nullptr;
860 if (++temp_dir_index == temp_dir_count)
861 temp_dir_index = 0;
863 /* Create the temporary file in a critical section, to avoid races. */
864 cs_enter (&cs);
865 fd = mkostemp (file, O_CLOEXEC);
866 if (0 <= fd)
868 *temptail = node;
869 temptail = &node->next;
871 saved_errno = errno;
872 cs_leave (&cs);
873 errno = saved_errno;
875 if (fd < 0)
877 if (! (survive_fd_exhaustion && errno == EMFILE))
878 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
879 quoteaf (temp_dir));
880 free (node);
881 node = nullptr;
884 *pfd = fd;
885 return node;
888 /* Return a pointer to stdout status, or nullptr on failure. */
890 static struct stat *
891 get_outstatus (void)
893 static int outstat_errno;
894 static struct stat outstat;
895 if (outstat_errno == 0)
896 outstat_errno = fstat (STDOUT_FILENO, &outstat) == 0 ? -1 : errno;
897 return outstat_errno < 0 ? &outstat : nullptr;
900 /* Return a stream for FILE, opened with mode HOW. If HOW is "w",
901 the file is already open on standard output, and needs to be
902 truncated unless FILE is null. When opening for input, "-"
903 means standard input. To avoid confusion, do not return file
904 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
905 opening an ordinary FILE. Return nullptr if unsuccessful.
907 Use fadvise to specify an access pattern for input files.
908 There are a few hints we could possibly provide,
909 and after careful testing it was decided that
910 specifying FADVISE_SEQUENTIAL was not detrimental
911 to any cases. On Linux 2.6.31, this option doubles
912 the size of read ahead performed and thus was seen to
913 benefit these cases:
914 Merging
915 Sorting with a smaller internal buffer
916 Reading from faster flash devices
918 In _addition_ one could also specify other hints...
920 FADVISE_WILLNEED was tested, but Linux 2.6.31
921 at least uses that to _synchronously_ prepopulate the cache
922 with the specified range. While sort does need to
923 read all of its input before outputting, a synchronous
924 read of the whole file up front precludes any processing
925 that sort could do in parallel with the system doing
926 read ahead of the data. This was seen to have negative effects
927 in a couple of cases:
928 Merging
929 Sorting with a smaller internal buffer
930 This option was seen to shorten the runtime for sort
931 on a multicore system with lots of RAM and other processes
932 competing for CPU. It could be argued that more explicit
933 scheduling hints with 'nice' et. al. are more appropriate
934 for this situation.
936 FADVISE_NOREUSE is a possibility as it could lower
937 the priority of input data in the cache as sort will
938 only need to process it once. However its functionality
939 has changed over Linux kernel versions and as of 2.6.31
940 it does nothing and thus we can't depend on what it might
941 do in future.
943 FADVISE_DONTNEED is not appropriate for user specified
944 input files, but for temp files we do want to drop the
945 cache immediately after processing. This is done implicitly
946 however when the files are unlinked. */
948 static FILE *
949 stream_open (char const *file, char const *how)
951 FILE *fp;
953 if (*how == 'r')
955 if (STREQ (file, "-"))
957 have_read_stdin = true;
958 fp = stdin;
960 else
962 int fd = open (file, O_RDONLY | O_CLOEXEC);
963 fp = fd < 0 ? nullptr : fdopen (fd, how);
965 fadvise (fp, FADVISE_SEQUENTIAL);
967 else if (*how == 'w')
969 if (file && ftruncate (STDOUT_FILENO, 0) != 0)
971 int ftruncate_errno = errno;
972 struct stat *outst = get_outstatus ();
973 if (!outst || S_ISREG (outst->st_mode) || S_TYPEISSHM (outst))
974 error (SORT_FAILURE, ftruncate_errno, _("%s: error truncating"),
975 quotef (file));
977 fp = stdout;
979 else
980 affirm (!"unexpected mode passed to stream_open");
982 return fp;
985 /* Same as stream_open, except always return a non-null value; die on
986 failure. */
988 static FILE *
989 xfopen (char const *file, char const *how)
991 FILE *fp = stream_open (file, how);
992 if (!fp)
993 sort_die (_("open failed"), file);
994 return fp;
997 /* Close FP, whose name is FILE, and report any errors. */
999 static void
1000 xfclose (FILE *fp, char const *file)
1002 switch (fileno (fp))
1004 case STDIN_FILENO:
1005 /* Allow reading stdin from tty more than once. */
1006 clearerr (fp);
1007 break;
1009 case STDOUT_FILENO:
1010 /* Don't close stdout just yet. close_stdout does that. */
1011 if (fflush (fp) != 0)
1012 sort_die (_("fflush failed"), file);
1013 break;
1015 default:
1016 if (fclose (fp) != 0)
1017 sort_die (_("close failed"), file);
1018 break;
1022 /* Move OLDFD to NEWFD. If OLDFD != NEWFD, NEWFD is not close-on-exec. */
1024 static void
1025 move_fd (int oldfd, int newfd)
1027 if (oldfd != newfd)
1029 /* These should never fail for our usage. */
1030 ignore_value (dup2 (oldfd, newfd));
1031 ignore_value (close (oldfd));
1035 /* Fork a child process for piping to and do common cleanup. The
1036 TRIES parameter specifies how many times to try to fork before
1037 giving up. Return the PID of the child, or -1 (setting errno)
1038 on failure. */
1040 static pid_t
1041 pipe_fork (int pipefds[2], size_t tries)
1043 #if HAVE_WORKING_FORK
1044 struct tempnode *saved_temphead;
1045 int saved_errno;
1046 double wait_retry = 0.25;
1047 pid_t pid;
1048 struct cs_status cs;
1050 if (pipe2 (pipefds, O_CLOEXEC) < 0)
1051 return -1;
1053 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1054 uncontrolled subprocess generation can hurt performance significantly.
1055 Allow at most NMERGE + 2 subprocesses, on the theory that there
1056 may be some useful parallelism by letting compression for the
1057 previous merge finish (1 subprocess) in parallel with the current
1058 merge (NMERGE + 1 subprocesses). */
1060 if (nmerge + 1 < nprocs)
1061 reap_some ();
1063 while (tries--)
1065 /* This is so the child process won't delete our temp files
1066 if it receives a signal before exec-ing. */
1067 cs_enter (&cs);
1068 saved_temphead = temphead;
1069 temphead = nullptr;
1071 pid = fork ();
1072 saved_errno = errno;
1073 if (pid)
1074 temphead = saved_temphead;
1076 cs_leave (&cs);
1077 errno = saved_errno;
1079 if (0 <= pid || errno != EAGAIN)
1080 break;
1081 else
1083 xnanosleep (wait_retry);
1084 wait_retry *= 2;
1085 reap_exited ();
1089 if (pid < 0)
1091 saved_errno = errno;
1092 close (pipefds[0]);
1093 close (pipefds[1]);
1094 errno = saved_errno;
1096 else if (pid == 0)
1098 close (STDIN_FILENO);
1099 close (STDOUT_FILENO);
1101 else
1102 ++nprocs;
1104 return pid;
1106 #else /* ! HAVE_WORKING_FORK */
1107 return -1;
1108 #endif
1111 /* Create a temporary file and, if asked for, start a compressor
1112 to that file. Set *PFP to the file handle and return
1113 the address of the new temp node. If the creation
1114 fails, return nullptr if the failure is due to file descriptor
1115 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1117 static struct tempnode *
1118 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1120 int tempfd;
1121 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1122 if (! node)
1123 return nullptr;
1125 node->state = UNCOMPRESSED;
1127 if (compress_program)
1129 int pipefds[2];
1131 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1132 if (0 < node->pid)
1134 close (tempfd);
1135 close (pipefds[0]);
1136 tempfd = pipefds[1];
1138 register_proc (node);
1140 else if (node->pid == 0)
1142 /* Being the child of a multithreaded program before exec,
1143 we're restricted to calling async-signal-safe routines here. */
1144 close (pipefds[1]);
1145 move_fd (tempfd, STDOUT_FILENO);
1146 move_fd (pipefds[0], STDIN_FILENO);
1148 execlp (compress_program, compress_program, (char *) nullptr);
1150 async_safe_die (errno, "couldn't execute compress program");
1154 *pfp = fdopen (tempfd, "w");
1155 if (! *pfp)
1156 sort_die (_("couldn't create temporary file"), node->name);
1158 return node;
1161 /* Create a temporary file and, if asked for, start a compressor
1162 to that file. Set *PFP to the file handle and return the address
1163 of the new temp node. Die on failure. */
1165 static struct tempnode *
1166 create_temp (FILE **pfp)
1168 return maybe_create_temp (pfp, false);
1171 /* Open a compressed temp file and start a decompression process through
1172 which to filter the input. Return nullptr (setting errno to
1173 EMFILE) if we ran out of file descriptors, and die on any other
1174 kind of failure. */
1176 static FILE *
1177 open_temp (struct tempnode *temp)
1179 int tempfd, pipefds[2];
1180 FILE *fp = nullptr;
1182 if (temp->state == UNREAPED)
1183 wait_proc (temp->pid);
1185 tempfd = open (temp->name, O_RDONLY);
1186 if (tempfd < 0)
1187 return nullptr;
1189 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1191 switch (child)
1193 case -1:
1194 if (errno != EMFILE)
1195 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1196 quoteaf (compress_program));
1197 close (tempfd);
1198 errno = EMFILE;
1199 break;
1201 case 0:
1202 /* Being the child of a multithreaded program before exec,
1203 we're restricted to calling async-signal-safe routines here. */
1204 close (pipefds[0]);
1205 move_fd (tempfd, STDIN_FILENO);
1206 move_fd (pipefds[1], STDOUT_FILENO);
1208 execlp (compress_program, compress_program, "-d", (char *) nullptr);
1210 async_safe_die (errno, "couldn't execute compress program (with -d)");
1212 default:
1213 temp->pid = child;
1214 register_proc (temp);
1215 close (tempfd);
1216 close (pipefds[1]);
1218 fp = fdopen (pipefds[0], "r");
1219 if (! fp)
1221 int saved_errno = errno;
1222 close (pipefds[0]);
1223 errno = saved_errno;
1225 break;
1228 return fp;
1231 /* Append DIR to the array of temporary directory names. */
1232 static void
1233 add_temp_dir (char const *dir)
1235 if (temp_dir_count == temp_dir_alloc)
1236 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1238 temp_dirs[temp_dir_count++] = dir;
1241 /* Remove NAME from the list of temporary files. */
1243 static void
1244 zaptemp (char const *name)
1246 struct tempnode *volatile *pnode;
1247 struct tempnode *node;
1248 struct tempnode *next;
1249 int unlink_status;
1250 int unlink_errno = 0;
1251 struct cs_status cs;
1253 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1254 continue;
1256 if (node->state == UNREAPED)
1257 wait_proc (node->pid);
1259 /* Unlink the temporary file in a critical section to avoid races. */
1260 next = node->next;
1261 cs_enter (&cs);
1262 unlink_status = unlink (name);
1263 unlink_errno = errno;
1264 *pnode = next;
1265 cs_leave (&cs);
1267 if (unlink_status != 0)
1268 error (0, unlink_errno, _("warning: cannot remove: %s"), quotef (name));
1269 if (! next)
1270 temptail = pnode;
1271 free (node);
1274 #if HAVE_NL_LANGINFO
1276 static int
1277 struct_month_cmp (void const *m1, void const *m2)
1279 struct month const *month1 = m1;
1280 struct month const *month2 = m2;
1281 return strcmp (month1->name, month2->name);
1284 #endif
1286 /* Initialize the character class tables. */
1288 static void
1289 inittables (void)
1291 size_t i;
1293 for (i = 0; i < UCHAR_LIM; ++i)
1295 blanks[i] = i == '\n' || isblank (i);
1296 nondictionary[i] = ! blanks[i] && ! isalnum (i);
1297 nonprinting[i] = ! isprint (i);
1298 fold_toupper[i] = toupper (i);
1301 #if HAVE_NL_LANGINFO
1302 /* If we're not in the "C" locale, read different names for months. */
1303 if (hard_LC_TIME)
1305 for (i = 0; i < MONTHS_PER_YEAR; i++)
1307 char const *s;
1308 size_t s_len;
1309 size_t j, k;
1310 char *name;
1312 s = nl_langinfo (ABMON_1 + i);
1313 s_len = strlen (s);
1314 monthtab[i].name = name = xmalloc (s_len + 1);
1315 monthtab[i].val = i + 1;
1317 for (j = k = 0; j < s_len; j++)
1318 if (! isblank (to_uchar (s[j])))
1319 name[k++] = fold_toupper[to_uchar (s[j])];
1320 name[k] = '\0';
1322 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1324 #endif
1327 /* Specify how many inputs may be merged at once.
1328 This may be set on the command-line with the
1329 --batch-size option. */
1330 static void
1331 specify_nmerge (int oi, char c, char const *s)
1333 uintmax_t n;
1334 struct rlimit rlimit;
1335 enum strtol_error e = xstrtoumax (s, nullptr, 10, &n, "");
1337 /* Try to find out how many file descriptors we'll be able
1338 to open. We need at least nmerge + 3 (STDIN_FILENO,
1339 STDOUT_FILENO and STDERR_FILENO). */
1340 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1341 ? rlimit.rlim_cur
1342 : OPEN_MAX)
1343 - 3);
1345 if (e == LONGINT_OK)
1347 nmerge = n;
1348 if (nmerge != n)
1349 e = LONGINT_OVERFLOW;
1350 else
1352 if (nmerge < 2)
1354 error (0, 0, _("invalid --%s argument %s"),
1355 long_options[oi].name, quote (s));
1356 error (SORT_FAILURE, 0,
1357 _("minimum --%s argument is %s"),
1358 long_options[oi].name, quote ("2"));
1360 else if (max_nmerge < nmerge)
1362 e = LONGINT_OVERFLOW;
1364 else
1365 return;
1369 if (e == LONGINT_OVERFLOW)
1371 error (0, 0, _("--%s argument %s too large"),
1372 long_options[oi].name, quote (s));
1373 error (SORT_FAILURE, 0,
1374 _("maximum --%s argument with current rlimit is %u"),
1375 long_options[oi].name, max_nmerge);
1377 else
1378 xstrtol_fatal (e, oi, c, long_options, s);
1381 /* Specify the amount of main memory to use when sorting. */
1382 static void
1383 specify_sort_size (int oi, char c, char const *s)
1385 uintmax_t n;
1386 char *suffix;
1387 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPQRtTYZ");
1389 /* The default unit is KiB. */
1390 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1392 if (n <= UINTMAX_MAX / 1024)
1393 n *= 1024;
1394 else
1395 e = LONGINT_OVERFLOW;
1398 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1399 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1400 switch (suffix[0])
1402 case 'b':
1403 e = LONGINT_OK;
1404 break;
1406 case '%':
1408 double mem = physmem_total () * n / 100;
1410 /* Use "<", not "<=", to avoid problems with rounding. */
1411 if (mem < UINTMAX_MAX)
1413 n = mem;
1414 e = LONGINT_OK;
1416 else
1417 e = LONGINT_OVERFLOW;
1419 break;
1422 if (e == LONGINT_OK)
1424 /* If multiple sort sizes are specified, take the maximum, so
1425 that option order does not matter. */
1426 if (n < sort_size)
1427 return;
1429 sort_size = n;
1430 if (sort_size == n)
1432 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1433 return;
1436 e = LONGINT_OVERFLOW;
1439 xstrtol_fatal (e, oi, c, long_options, s);
1442 /* Specify the number of threads to spawn during internal sort. */
1443 static size_t
1444 specify_nthreads (int oi, char c, char const *s)
1446 uintmax_t nthreads;
1447 enum strtol_error e = xstrtoumax (s, nullptr, 10, &nthreads, "");
1448 if (e == LONGINT_OVERFLOW)
1449 return SIZE_MAX;
1450 if (e != LONGINT_OK)
1451 xstrtol_fatal (e, oi, c, long_options, s);
1452 if (SIZE_MAX < nthreads)
1453 nthreads = SIZE_MAX;
1454 if (nthreads == 0)
1455 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1456 return nthreads;
1459 /* Return the default sort size. */
1460 static size_t
1461 default_sort_size (void)
1463 /* Let SIZE be MEM, but no more than the maximum object size,
1464 total memory, or system resource limits. Don't bother to check
1465 for values like RLIM_INFINITY since in practice they are not much
1466 less than SIZE_MAX. */
1467 size_t size = SIZE_MAX;
1468 struct rlimit rlimit;
1469 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1470 size = rlimit.rlim_cur;
1471 #ifdef RLIMIT_AS
1472 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1473 size = rlimit.rlim_cur;
1474 #endif
1476 /* Leave a large safety margin for the above limits, as failure can
1477 occur when they are exceeded. */
1478 size /= 2;
1480 #ifdef RLIMIT_RSS
1481 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1482 Exceeding RSS is not fatal, but can be quite slow. */
1483 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1484 size = rlimit.rlim_cur / 16 * 15;
1485 #endif
1487 /* Let MEM be available memory or 1/8 of total memory, whichever
1488 is greater. */
1489 double avail = physmem_available ();
1490 double total = physmem_total ();
1491 double mem = MAX (avail, total / 8);
1493 /* Leave a 1/4 margin for physical memory. */
1494 if (total * 0.75 < size)
1495 size = total * 0.75;
1497 /* Return the minimum of MEM and SIZE, but no less than
1498 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1499 right when only one argument is floating point. */
1500 if (mem < size)
1501 size = mem;
1502 return MAX (size, MIN_SORT_SIZE);
1505 /* Return the sort buffer size to use with the input files identified
1506 by FPS and FILES, which are alternate names of the same files.
1507 NFILES gives the number of input files; NFPS may be less. Assume
1508 that each input line requires LINE_BYTES extra bytes' worth of line
1509 information. Do not exceed the size bound specified by the user
1510 (or a default size bound, if the user does not specify one). */
1512 static size_t
1513 sort_buffer_size (FILE *const *fps, size_t nfps,
1514 char *const *files, size_t nfiles,
1515 size_t line_bytes)
1517 /* A bound on the input size. If zero, the bound hasn't been
1518 determined yet. */
1519 static size_t size_bound;
1521 /* In the worst case, each input byte is a newline. */
1522 size_t worst_case_per_input_byte = line_bytes + 1;
1524 /* Keep enough room for one extra input line and an extra byte.
1525 This extra room might be needed when preparing to read EOF. */
1526 size_t size = worst_case_per_input_byte + 1;
1528 for (size_t i = 0; i < nfiles; i++)
1530 struct stat st;
1531 off_t file_size;
1532 size_t worst_case;
1534 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1535 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1536 : stat (files[i], &st))
1537 != 0)
1538 sort_die (_("stat failed"), files[i]);
1540 if (usable_st_size (&st) && 0 < st.st_size)
1541 file_size = st.st_size;
1542 else
1544 /* The file has unknown size. If the user specified a sort
1545 buffer size, use that; otherwise, guess the size. */
1546 if (sort_size)
1547 return sort_size;
1548 file_size = INPUT_FILE_SIZE_GUESS;
1551 if (! size_bound)
1553 size_bound = sort_size;
1554 if (! size_bound)
1555 size_bound = default_sort_size ();
1558 /* Add the amount of memory needed to represent the worst case
1559 where the input consists entirely of newlines followed by a
1560 single non-newline. Check for overflow. */
1561 worst_case = file_size * worst_case_per_input_byte + 1;
1562 if (file_size != worst_case / worst_case_per_input_byte
1563 || size_bound - size <= worst_case)
1564 return size_bound;
1565 size += worst_case;
1568 return size;
1571 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1572 must be at least sizeof (struct line). Allocate ALLOC bytes
1573 initially. */
1575 static void
1576 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1578 /* Ensure that the line array is properly aligned. If the desired
1579 size cannot be allocated, repeatedly halve it until allocation
1580 succeeds. The smaller allocation may hurt overall performance,
1581 but that's better than failing. */
1582 while (true)
1584 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1585 buf->buf = malloc (alloc);
1586 if (buf->buf)
1587 break;
1588 alloc /= 2;
1589 if (alloc <= line_bytes + 1)
1590 xalloc_die ();
1593 buf->line_bytes = line_bytes;
1594 buf->alloc = alloc;
1595 buf->used = buf->left = buf->nlines = 0;
1596 buf->eof = false;
1599 /* Return one past the limit of the line array. */
1601 static inline struct line *
1602 buffer_linelim (struct buffer const *buf)
1604 void *linelim = buf->buf + buf->alloc;
1605 return linelim;
1608 /* Return a pointer to the first character of the field specified
1609 by KEY in LINE. */
1611 static char *
1612 begfield (struct line const *line, struct keyfield const *key)
1614 char *ptr = line->text, *lim = ptr + line->length - 1;
1615 size_t sword = key->sword;
1616 size_t schar = key->schar;
1618 /* The leading field separator itself is included in a field when -t
1619 is absent. */
1621 if (tab != TAB_DEFAULT)
1622 while (ptr < lim && sword--)
1624 while (ptr < lim && *ptr != tab)
1625 ++ptr;
1626 if (ptr < lim)
1627 ++ptr;
1629 else
1630 while (ptr < lim && sword--)
1632 while (ptr < lim && blanks[to_uchar (*ptr)])
1633 ++ptr;
1634 while (ptr < lim && !blanks[to_uchar (*ptr)])
1635 ++ptr;
1638 /* If we're ignoring leading blanks when computing the Start
1639 of the field, skip past them here. */
1640 if (key->skipsblanks)
1641 while (ptr < lim && blanks[to_uchar (*ptr)])
1642 ++ptr;
1644 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1645 ptr = MIN (lim, ptr + schar);
1647 return ptr;
1650 /* Return the limit of (a pointer to the first character after) the field
1651 in LINE specified by KEY. */
1653 ATTRIBUTE_PURE
1654 static char *
1655 limfield (struct line const *line, struct keyfield const *key)
1657 char *ptr = line->text, *lim = ptr + line->length - 1;
1658 size_t eword = key->eword, echar = key->echar;
1660 if (echar == 0)
1661 eword++; /* Skip all of end field. */
1663 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1664 whichever comes first. If there are more than EWORD fields, leave
1665 PTR pointing at the beginning of the field having zero-based index,
1666 EWORD. If a delimiter character was specified (via -t), then that
1667 'beginning' is the first character following the delimiting TAB.
1668 Otherwise, leave PTR pointing at the first 'blank' character after
1669 the preceding field. */
1670 if (tab != TAB_DEFAULT)
1671 while (ptr < lim && eword--)
1673 while (ptr < lim && *ptr != tab)
1674 ++ptr;
1675 if (ptr < lim && (eword || echar))
1676 ++ptr;
1678 else
1679 while (ptr < lim && eword--)
1681 while (ptr < lim && blanks[to_uchar (*ptr)])
1682 ++ptr;
1683 while (ptr < lim && !blanks[to_uchar (*ptr)])
1684 ++ptr;
1687 #ifdef POSIX_UNSPECIFIED
1688 /* The following block of code makes GNU sort incompatible with
1689 standard Unix sort, so it's ifdef'd out for now.
1690 The POSIX spec isn't clear on how to interpret this.
1691 FIXME: request clarification.
1693 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1694 Date: Thu, 30 May 96 12:20:41 -0400
1695 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1697 [...]I believe I've found another bug in 'sort'.
1699 $ cat /tmp/sort.in
1700 a b c 2 d
1701 pq rs 1 t
1702 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1703 a b c 2 d
1704 pq rs 1 t
1705 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1706 pq rs 1 t
1707 a b c 2 d
1709 Unix sort produced the answer I expected: sort on the single character
1710 in column 7. GNU sort produced different results, because it disagrees
1711 on the interpretation of the key-end spec "M.N". Unix sort reads this
1712 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1713 "skip M-1 fields, then either N-1 characters or the rest of the current
1714 field, whichever comes first". This extra clause applies only to
1715 key-ends, not key-starts.
1718 /* Make LIM point to the end of (one byte past) the current field. */
1719 if (tab != TAB_DEFAULT)
1721 char *newlim;
1722 newlim = memchr (ptr, tab, lim - ptr);
1723 if (newlim)
1724 lim = newlim;
1726 else
1728 char *newlim;
1729 newlim = ptr;
1730 while (newlim < lim && blanks[to_uchar (*newlim)])
1731 ++newlim;
1732 while (newlim < lim && !blanks[to_uchar (*newlim)])
1733 ++newlim;
1734 lim = newlim;
1736 #endif
1738 if (echar != 0) /* We need to skip over a portion of the end field. */
1740 /* If we're ignoring leading blanks when computing the End
1741 of the field, skip past them here. */
1742 if (key->skipeblanks)
1743 while (ptr < lim && blanks[to_uchar (*ptr)])
1744 ++ptr;
1746 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1747 ptr = MIN (lim, ptr + echar);
1750 return ptr;
1753 /* Fill BUF reading from FP, moving buf->left bytes from the end
1754 of buf->buf to the beginning first. If EOF is reached and the
1755 file wasn't terminated by a newline, supply one. Set up BUF's line
1756 table too. FILE is the name of the file corresponding to FP.
1757 Return true if some input was read. */
1759 static bool
1760 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1762 struct keyfield const *key = keylist;
1763 char eol = eolchar;
1764 size_t line_bytes = buf->line_bytes;
1765 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1767 if (buf->eof)
1768 return false;
1770 if (buf->used != buf->left)
1772 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1773 buf->used = buf->left;
1774 buf->nlines = 0;
1777 while (true)
1779 char *ptr = buf->buf + buf->used;
1780 struct line *linelim = buffer_linelim (buf);
1781 struct line *line = linelim - buf->nlines;
1782 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1783 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1785 while (line_bytes + 1 < avail)
1787 /* Read as many bytes as possible, but do not read so many
1788 bytes that there might not be enough room for the
1789 corresponding line array. The worst case is when the
1790 rest of the input file consists entirely of newlines,
1791 except that the last byte is not a newline. */
1792 size_t readsize = (avail - 1) / (line_bytes + 1);
1793 size_t bytes_read = fread (ptr, 1, readsize, fp);
1794 char *ptrlim = ptr + bytes_read;
1795 char *p;
1796 avail -= bytes_read;
1798 if (bytes_read != readsize)
1800 if (ferror (fp))
1801 sort_die (_("read failed"), file);
1802 if (feof (fp))
1804 buf->eof = true;
1805 if (buf->buf == ptrlim)
1806 return false;
1807 if (line_start != ptrlim && ptrlim[-1] != eol)
1808 *ptrlim++ = eol;
1812 /* Find and record each line in the just-read input. */
1813 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1815 /* Delimit the line with NUL. This eliminates the need to
1816 temporarily replace the last byte with NUL when calling
1817 xmemcoll, which increases performance. */
1818 *p = '\0';
1819 ptr = p + 1;
1820 line--;
1821 line->text = line_start;
1822 line->length = ptr - line_start;
1823 mergesize = MAX (mergesize, line->length);
1824 avail -= line_bytes;
1826 if (key)
1828 /* Precompute the position of the first key for
1829 efficiency. */
1830 line->keylim = (key->eword == SIZE_MAX
1832 : limfield (line, key));
1834 if (key->sword != SIZE_MAX)
1835 line->keybeg = begfield (line, key);
1836 else
1838 if (key->skipsblanks)
1839 while (blanks[to_uchar (*line_start)])
1840 line_start++;
1841 line->keybeg = line_start;
1845 line_start = ptr;
1848 ptr = ptrlim;
1849 if (buf->eof)
1850 break;
1853 buf->used = ptr - buf->buf;
1854 buf->nlines = buffer_linelim (buf) - line;
1855 if (buf->nlines != 0)
1857 buf->left = ptr - line_start;
1858 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1859 return true;
1863 /* The current input line is too long to fit in the buffer.
1864 Increase the buffer size and try again, keeping it properly
1865 aligned. */
1866 size_t line_alloc = buf->alloc / sizeof (struct line);
1867 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1868 buf->alloc = line_alloc * sizeof (struct line);
1873 /* Table that maps characters to order-of-magnitude values. */
1874 static char const unit_order[UCHAR_LIM] =
1876 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1877 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'R' == 82 && 'Q' == 81 \
1878 && 'k' == 107)
1879 /* This initializer syntax works on all C99 hosts. For now, use
1880 it only on non-ASCII hosts, to ease the pain of porting to
1881 pre-C99 ASCII hosts. */
1882 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1883 ['R']=9, ['Q']=10,
1884 ['k']=1,
1885 #else
1886 /* Generate the following table with this command:
1887 perl -e 'my %a=(k=>1,K=>1,M=>2,G=>3,T=>4,P=>5,E=>6,Z=>7,Y=>8,R=>9,Q=>10);
1888 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1889 |fmt */
1890 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1891 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1892 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1893 0, 0, 0, 1, 0, 2, 0, 0, 5, 10, 9, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1894 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1895 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1896 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1897 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1898 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1899 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1900 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1901 #endif
1904 /* Traverse number given as *number consisting of digits, thousands_sep, and
1905 decimal_point chars only. Returns the highest digit found in the number,
1906 or '\0' if no digit has been found. Upon return *number points at the
1907 character that immediately follows after the given number. */
1908 static char
1909 traverse_raw_number (char const **number)
1911 char const *p = *number;
1912 char ch;
1913 char max_digit = '\0';
1914 bool ends_with_thousands_sep = false;
1916 /* Scan to end of number.
1917 Decimals or separators not followed by digits stop the scan.
1918 Numbers ending in decimals or separators are thus considered
1919 to be lacking in units.
1920 FIXME: add support for multibyte thousands_sep and decimal_point. */
1922 while (ISDIGIT (ch = *p++))
1924 if (max_digit < ch)
1925 max_digit = ch;
1927 /* Allow to skip only one occurrence of thousands_sep to avoid finding
1928 the unit in the next column in case thousands_sep matches as blank
1929 and is used as column delimiter. */
1930 ends_with_thousands_sep = (*p == thousands_sep);
1931 if (ends_with_thousands_sep)
1932 ++p;
1935 if (ends_with_thousands_sep)
1937 /* thousands_sep not followed by digit is not allowed. */
1938 *number = p - 2;
1939 return max_digit;
1942 if (ch == decimal_point)
1943 while (ISDIGIT (ch = *p++))
1944 if (max_digit < ch)
1945 max_digit = ch;
1947 *number = p - 1;
1948 return max_digit;
1951 /* Return an integer that represents the order of magnitude of the
1952 unit following the number. The number may contain thousands
1953 separators and a decimal point, but it may not contain leading blanks.
1954 Negative numbers get negative orders; zero numbers have a zero order. */
1956 ATTRIBUTE_PURE
1957 static int
1958 find_unit_order (char const *number)
1960 bool minus_sign = (*number == '-');
1961 char const *p = number + minus_sign;
1962 char max_digit = traverse_raw_number (&p);
1963 if ('0' < max_digit)
1965 unsigned char ch = *p;
1966 int order = unit_order[ch];
1967 return (minus_sign ? -order : order);
1969 else
1970 return 0;
1973 /* Compare numbers A and B ending in units with SI or IEC prefixes
1974 <none/unknown> < K/k < M < G < T < P < E < Z < Y < R < Q */
1976 ATTRIBUTE_PURE
1977 static int
1978 human_numcompare (char const *a, char const *b)
1980 while (blanks[to_uchar (*a)])
1981 a++;
1982 while (blanks[to_uchar (*b)])
1983 b++;
1985 int diff = find_unit_order (a) - find_unit_order (b);
1986 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1989 /* Compare strings A and B as numbers without explicitly converting them to
1990 machine numbers. Comparatively slow for short strings, but asymptotically
1991 hideously fast. */
1993 ATTRIBUTE_PURE
1994 static int
1995 numcompare (char const *a, char const *b)
1997 while (blanks[to_uchar (*a)])
1998 a++;
1999 while (blanks[to_uchar (*b)])
2000 b++;
2002 return strnumcmp (a, b, decimal_point, thousands_sep);
2005 static int
2006 nan_compare (long double a, long double b)
2008 char buf[2][sizeof "-nan""()" + CHAR_BIT * sizeof a];
2009 snprintf (buf[0], sizeof buf[0], "%Lf", a);
2010 snprintf (buf[1], sizeof buf[1], "%Lf", b);
2011 return strcmp (buf[0], buf[1]);
2014 static int
2015 general_numcompare (char const *sa, char const *sb)
2017 /* FIXME: maybe add option to try expensive FP conversion
2018 only if A and B can't be compared more cheaply/accurately. */
2020 char *ea;
2021 char *eb;
2022 long double a = strtold (sa, &ea);
2023 long double b = strtold (sb, &eb);
2025 /* Put conversion errors at the start of the collating sequence. */
2026 if (sa == ea)
2027 return sb == eb ? 0 : -1;
2028 if (sb == eb)
2029 return 1;
2031 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
2032 conversion errors but before numbers; sort them by internal
2033 bit-pattern, for lack of a more portable alternative. */
2034 return (a < b ? -1
2035 : a > b ? 1
2036 : a == b ? 0
2037 : b == b ? -1
2038 : a == a ? 1
2039 : nan_compare (a, b));
2042 /* Return an integer in 1..12 of the month name MONTH.
2043 Return 0 if the name in S is not recognized. */
2045 static int
2046 getmonth (char const *month, char **ea)
2048 size_t lo = 0;
2049 size_t hi = MONTHS_PER_YEAR;
2051 while (blanks[to_uchar (*month)])
2052 month++;
2056 size_t ix = (lo + hi) / 2;
2057 char const *m = month;
2058 char const *n = monthtab[ix].name;
2060 for (;; m++, n++)
2062 if (!*n)
2064 if (ea)
2065 *ea = (char *) m;
2066 return monthtab[ix].val;
2068 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
2070 hi = ix;
2071 break;
2073 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
2075 lo = ix + 1;
2076 break;
2080 while (lo < hi);
2082 return 0;
2085 /* When using the OpenSSL implementation, dynamically link only if -R.
2086 This saves startup time in the usual (sans -R) case. */
2088 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2089 /* In the typical case where md5.h does not #undef HAVE_OPENSSL_MD5,
2090 trick md5.h into declaring and using pointers to functions not functions.
2091 This causes the compiler's -lcrypto option to have no effect,
2092 as sort.o no longer uses any crypto symbols statically. */
2094 # if 14 <= __GNUC__
2095 # pragma GCC diagnostic push
2096 # pragma GCC diagnostic ignored "-Wmissing-variable-declarations"
2097 # endif
2098 # define MD5_Init (*ptr_MD5_Init)
2099 # define MD5_Update (*ptr_MD5_Update)
2100 # define MD5_Final (*ptr_MD5_Final)
2101 #endif
2103 #include "md5.h"
2105 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2106 # if 14 <= __GNUC__
2107 # pragma GCC diagnostic pop
2108 # endif
2109 # include <dlfcn.h>
2111 /* Diagnose a dynamic linking failure. */
2112 static void
2113 link_failure (void)
2115 error (SORT_FAILURE, 0, "%s", dlerror ());
2118 /* Return a function pointer in HANDLE for SYMBOL. */
2119 static void *
2120 symbol_address (void *handle, char const *symbol)
2122 void *address = dlsym (handle, symbol);
2123 if (!address)
2124 link_failure ();
2125 return address;
2127 #endif
2129 /* Dynamically link the crypto library, if it needs linking. */
2130 static void
2131 link_libcrypto (void)
2133 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2134 void *handle = dlopen (LIBCRYPTO_SONAME, RTLD_LAZY | RTLD_GLOBAL);
2135 if (!handle)
2136 link_failure ();
2137 ptr_MD5_Init = symbol_address (handle, "MD5_Init");
2138 ptr_MD5_Update = symbol_address (handle, "MD5_Update");
2139 ptr_MD5_Final = symbol_address (handle, "MD5_Final");
2140 #endif
2143 /* A randomly chosen MD5 state, used for random comparison. */
2144 static struct md5_ctx random_md5_state;
2146 /* Initialize the randomly chosen MD5 state. */
2148 static void
2149 random_md5_state_init (char const *random_source)
2151 unsigned char buf[MD5_DIGEST_SIZE];
2152 struct randread_source *r = randread_new (random_source, sizeof buf);
2153 if (! r)
2154 sort_die (_("open failed"), random_source ? random_source : "getrandom");
2155 randread (r, buf, sizeof buf);
2156 if (randread_free (r) != 0)
2157 sort_die (_("close failed"), random_source);
2158 link_libcrypto ();
2159 md5_init_ctx (&random_md5_state);
2160 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2163 /* This is like strxfrm, except it reports any error and exits. */
2165 static size_t
2166 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2168 errno = 0;
2169 size_t translated_size = strxfrm (dest, src, destsize);
2171 if (errno)
2173 error (0, errno, _("string transformation failed"));
2174 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2175 error (SORT_FAILURE, 0,
2176 _("the original string was %s"),
2177 quotearg_n_style (0, locale_quoting_style, src));
2180 return translated_size;
2183 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2184 using one or more random hash functions. TEXTA[LENA] and
2185 TEXTB[LENB] must be zero. */
2187 static int
2188 compare_random (char *restrict texta, size_t lena,
2189 char *restrict textb, size_t lenb)
2191 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2192 data. This is used to break ties if there is a checksum
2193 collision, and this is good enough given the astronomically low
2194 probability of a collision. */
2195 int xfrm_diff = 0;
2197 char stackbuf[4000];
2198 char *buf = stackbuf;
2199 size_t bufsize = sizeof stackbuf;
2200 void *allocated = nullptr;
2201 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2202 struct md5_ctx s[2];
2203 s[0] = s[1] = random_md5_state;
2205 if (hard_LC_COLLATE)
2207 char const *lima = texta + lena;
2208 char const *limb = textb + lenb;
2210 while (true)
2212 /* Transform the text into the basis of comparison, so that byte
2213 strings that would otherwise considered to be equal are
2214 considered equal here even if their bytes differ.
2216 Each time through this loop, transform one
2217 null-terminated string's worth from TEXTA or from TEXTB
2218 or both. That way, there's no need to store the
2219 transformation of the whole line, if it contains many
2220 null-terminated strings. */
2222 /* Store the transformed data into a big-enough buffer. */
2224 /* A 3X size guess avoids the overhead of calling strxfrm
2225 twice on typical implementations. Don't worry about
2226 size_t overflow, as the guess need not be correct. */
2227 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2228 if (bufsize < guess_bufsize)
2230 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2231 free (allocated);
2232 buf = allocated = malloc (bufsize);
2233 if (! buf)
2235 buf = stackbuf;
2236 bufsize = sizeof stackbuf;
2240 size_t sizea =
2241 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2242 bool a_fits = sizea <= bufsize;
2243 size_t sizeb =
2244 (textb < limb
2245 ? (xstrxfrm ((a_fits ? buf + sizea : nullptr), textb,
2246 (a_fits ? bufsize - sizea : 0))
2247 + 1)
2248 : 0);
2250 if (! (a_fits && sizea + sizeb <= bufsize))
2252 bufsize = sizea + sizeb;
2253 if (bufsize < SIZE_MAX / 3)
2254 bufsize = bufsize * 3 / 2;
2255 free (allocated);
2256 buf = allocated = xmalloc (bufsize);
2257 if (texta < lima)
2258 strxfrm (buf, texta, sizea);
2259 if (textb < limb)
2260 strxfrm (buf + sizea, textb, sizeb);
2263 /* Advance past NULs to the next part of each input string,
2264 exiting the loop if both strings are exhausted. When
2265 exiting the loop, prepare to finish off the tiebreaker
2266 comparison properly. */
2267 if (texta < lima)
2268 texta += strlen (texta) + 1;
2269 if (textb < limb)
2270 textb += strlen (textb) + 1;
2271 if (! (texta < lima || textb < limb))
2273 lena = sizea; texta = buf;
2274 lenb = sizeb; textb = buf + sizea;
2275 break;
2278 /* Accumulate the transformed data in the corresponding
2279 checksums. */
2280 md5_process_bytes (buf, sizea, &s[0]);
2281 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2283 /* Update the tiebreaker comparison of the transformed data. */
2284 if (! xfrm_diff)
2286 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2287 if (! xfrm_diff)
2288 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2293 /* Compute and compare the checksums. */
2294 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2295 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2296 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2298 /* Fall back on the tiebreaker if the checksums collide. */
2299 if (! diff)
2301 if (! xfrm_diff)
2303 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2304 if (! xfrm_diff)
2305 xfrm_diff = (lena > lenb) - (lena < lenb);
2308 diff = xfrm_diff;
2311 free (allocated);
2313 return diff;
2316 /* Return the printable width of the block of memory starting at
2317 TEXT and ending just before LIM, counting each tab as one byte.
2318 FIXME: Should we generally be counting non printable chars? */
2320 static size_t
2321 debug_width (char const *text, char const *lim)
2323 size_t width = mbsnwidth (text, lim - text, 0);
2324 while (text < lim)
2325 width += (*text++ == '\t');
2326 return width;
2329 /* For debug mode, "underline" a key at the
2330 specified offset and screen width. */
2332 static void
2333 mark_key (size_t offset, size_t width)
2335 while (offset--)
2336 putchar (' ');
2338 if (!width)
2339 printf (_("^ no match for key\n"));
2340 else
2343 putchar ('_');
2344 while (--width);
2346 putchar ('\n');
2350 /* Return true if KEY is a numeric key. */
2352 static inline bool
2353 key_numeric (struct keyfield const *key)
2355 return key->numeric || key->general_numeric || key->human_numeric;
2358 /* For LINE, output a debugging line that underlines KEY in LINE.
2359 If KEY is null, underline the whole line. */
2361 static void
2362 debug_key (struct line const *line, struct keyfield const *key)
2364 char *text = line->text;
2365 char *beg = text;
2366 char *lim = text + line->length - 1;
2368 if (key)
2370 if (key->sword != SIZE_MAX)
2371 beg = begfield (line, key);
2372 if (key->eword != SIZE_MAX)
2373 lim = limfield (line, key);
2375 if ((key->skipsblanks && key->sword == SIZE_MAX)
2376 || key->month || key_numeric (key))
2378 char saved = *lim;
2379 *lim = '\0';
2381 while (blanks[to_uchar (*beg)])
2382 beg++;
2384 char *tighter_lim = beg;
2386 if (lim < beg)
2387 tighter_lim = lim;
2388 else if (key->month)
2389 getmonth (beg, &tighter_lim);
2390 else if (key->general_numeric)
2391 ignore_value (strtold (beg, &tighter_lim));
2392 else if (key->numeric || key->human_numeric)
2394 char const *p = beg + (beg < lim && *beg == '-');
2395 char max_digit = traverse_raw_number (&p);
2396 if ('0' <= max_digit)
2398 unsigned char ch = *p;
2399 tighter_lim = (char *) p
2400 + (key->human_numeric && unit_order[ch]);
2403 else
2404 tighter_lim = lim;
2406 *lim = saved;
2407 lim = tighter_lim;
2411 size_t offset = debug_width (text, beg);
2412 size_t width = debug_width (beg, lim);
2413 mark_key (offset, width);
2416 /* Debug LINE by underlining its keys. */
2418 static void
2419 debug_line (struct line const *line)
2421 struct keyfield const *key = keylist;
2424 debug_key (line, key);
2425 while (key && ((key = key->next) || ! (unique || stable)));
2428 /* Return whether sorting options specified for key. */
2430 static bool
2431 default_key_compare (struct keyfield const *key)
2433 return ! (key->ignore
2434 || key->translate
2435 || key->skipsblanks
2436 || key->skipeblanks
2437 || key_numeric (key)
2438 || key->month
2439 || key->version
2440 || key->random
2441 /* || key->reverse */
2445 /* Convert a key to the short options used to specify it. */
2447 static void
2448 key_to_opts (struct keyfield const *key, char *opts)
2450 if (key->skipsblanks || key->skipeblanks)
2451 *opts++ = 'b';/* either disables global -b */
2452 if (key->ignore == nondictionary)
2453 *opts++ = 'd';
2454 if (key->translate)
2455 *opts++ = 'f';
2456 if (key->general_numeric)
2457 *opts++ = 'g';
2458 if (key->human_numeric)
2459 *opts++ = 'h';
2460 if (key->ignore == nonprinting)
2461 *opts++ = 'i';
2462 if (key->month)
2463 *opts++ = 'M';
2464 if (key->numeric)
2465 *opts++ = 'n';
2466 if (key->random)
2467 *opts++ = 'R';
2468 if (key->reverse)
2469 *opts++ = 'r';
2470 if (key->version)
2471 *opts++ = 'V';
2472 *opts = '\0';
2475 /* Output data independent key warnings to stderr. */
2477 static void
2478 key_warnings (struct keyfield const *gkey, bool gkey_only)
2480 struct keyfield const *key;
2481 struct keyfield ugkey = *gkey;
2482 unsigned long keynum = 1;
2483 bool basic_numeric_field = false;
2484 bool general_numeric_field = false;
2485 bool basic_numeric_field_span = false;
2486 bool general_numeric_field_span = false;
2488 for (key = keylist; key; key = key->next, keynum++)
2490 if (key_numeric (key))
2492 if (key->general_numeric)
2493 general_numeric_field = true;
2494 else
2495 basic_numeric_field = true;
2498 if (key->traditional_used)
2500 size_t sword = key->sword;
2501 size_t eword = key->eword;
2502 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2503 /* obsolescent syntax +A.x -B.y is equivalent to:
2504 -k A+1.x+1,B.y (when y = 0)
2505 -k A+1.x+1,B+1.y (when y > 0) */
2506 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2507 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2508 char *po = obuf;
2509 char *pn = nbuf;
2511 if (sword == SIZE_MAX)
2512 sword++;
2514 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2515 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2516 if (key->eword != SIZE_MAX)
2518 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2519 stpcpy (stpcpy (pn, ","),
2520 umaxtostr (eword + 1
2521 + (key->echar == SIZE_MAX), tmp));
2523 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2524 quote_n (0, obuf), quote_n (1, nbuf));
2527 /* Warn about field specs that will never match. */
2528 bool zero_width = key->sword != SIZE_MAX && key->eword < key->sword;
2529 if (zero_width)
2530 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2532 /* Warn about significant leading blanks. */
2533 bool implicit_skip = key_numeric (key) || key->month;
2534 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2535 if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
2536 && ((!key->skipsblanks && !implicit_skip)
2537 || (!key->skipsblanks && key->schar)
2538 || (!key->skipeblanks && key->echar)))
2539 error (0, 0, _("leading blanks are significant in key %lu; "
2540 "consider also specifying 'b'"), keynum);
2542 /* Warn about numeric comparisons spanning fields,
2543 as field delimiters could be interpreted as part
2544 of the number (maybe only in other locales). */
2545 if (!gkey_only && key_numeric (key))
2547 size_t sword = key->sword + 1;
2548 size_t eword = key->eword + 1;
2549 if (!sword)
2550 sword++;
2551 if (!eword || sword < eword)
2553 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2554 keynum);
2555 if (key->general_numeric)
2556 general_numeric_field_span = true;
2557 else
2558 basic_numeric_field_span = true;
2562 /* Flag global options not copied or specified in any key. */
2563 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2564 ugkey.ignore = nullptr;
2565 if (ugkey.translate && (ugkey.translate == key->translate))
2566 ugkey.translate = nullptr;
2567 ugkey.skipsblanks &= !key->skipsblanks;
2568 ugkey.skipeblanks &= !key->skipeblanks;
2569 ugkey.month &= !key->month;
2570 ugkey.numeric &= !key->numeric;
2571 ugkey.general_numeric &= !key->general_numeric;
2572 ugkey.human_numeric &= !key->human_numeric;
2573 ugkey.random &= !key->random;
2574 ugkey.version &= !key->version;
2575 ugkey.reverse &= !key->reverse;
2578 /* Explicitly warn if field delimiters in this locale
2579 don't constrain numbers. */
2580 bool number_locale_warned = false;
2581 if (basic_numeric_field_span)
2583 if (tab == TAB_DEFAULT
2584 ? thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep)))
2585 : tab == thousands_sep)
2587 error (0, 0,
2588 _("field separator %s is treated as a "
2589 "group separator in numbers"),
2590 quote (((char []) {thousands_sep, 0})));
2591 number_locale_warned = true;
2594 if (basic_numeric_field_span || general_numeric_field_span)
2596 if (tab == TAB_DEFAULT
2597 ? thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point)))
2598 : tab == decimal_point)
2600 error (0, 0,
2601 _("field separator %s is treated as a "
2602 "decimal point in numbers"),
2603 quote (((char []) {decimal_point, 0})));
2604 number_locale_warned = true;
2606 else if (tab == '-')
2608 error (0, 0,
2609 _("field separator %s is treated as a "
2610 "minus sign in numbers"),
2611 quote (((char []) {tab, 0})));
2613 else if (general_numeric_field_span && tab == '+')
2615 error (0, 0,
2616 _("field separator %s is treated as a "
2617 "plus sign in numbers"),
2618 quote (((char []) {tab, 0})));
2622 /* Explicitly indicate the decimal point used in this locale,
2623 as it suggests that robust scripts need to consider
2624 setting the locale when comparing numbers. */
2625 if ((basic_numeric_field || general_numeric_field) && ! number_locale_warned)
2627 error (0, 0,
2628 _("%snumbers use %s as a decimal point in this locale"),
2629 tab == decimal_point ? "" : _("note "),
2630 quote (((char []) {decimal_point, 0})));
2634 if (basic_numeric_field && thousands_sep_ignored)
2636 error (0, 0,
2637 _("the multi-byte number group separator "
2638 "in this locale is not supported"));
2641 /* Warn about ignored global options flagged above.
2642 This clears all flags if UGKEY is the only one in the list. */
2643 if (!default_key_compare (&ugkey)
2644 || (ugkey.reverse && (stable || unique) && keylist))
2646 bool ugkey_reverse = ugkey.reverse;
2647 if (!(stable || unique))
2648 ugkey.reverse = false;
2649 /* The following is too big, but guaranteed to be "big enough". */
2650 char opts[sizeof short_options];
2651 key_to_opts (&ugkey, opts);
2652 error (0, 0,
2653 ngettext ("option '-%s' is ignored",
2654 "options '-%s' are ignored",
2655 select_plural (strlen (opts))), opts);
2656 ugkey.reverse = ugkey_reverse;
2658 if (ugkey.reverse && !(stable || unique) && keylist)
2659 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2662 /* Return either the sense of DIFF or its reverse, depending on REVERSED.
2663 If REVERSED, do not simply negate DIFF as that can mishandle INT_MIN. */
2665 static int
2666 diff_reversed (int diff, bool reversed)
2668 return reversed ? (diff < 0) - (diff > 0) : diff;
2671 /* Compare two lines A and B trying every key in sequence until there
2672 are no more keys or a difference is found. */
2674 static int
2675 keycompare (struct line const *a, struct line const *b)
2677 struct keyfield *key = keylist;
2679 /* For the first iteration only, the key positions have been
2680 precomputed for us. */
2681 char *texta = a->keybeg;
2682 char *textb = b->keybeg;
2683 char *lima = a->keylim;
2684 char *limb = b->keylim;
2686 int diff;
2688 while (true)
2690 char const *translate = key->translate;
2691 bool const *ignore = key->ignore;
2693 /* Treat field ends before field starts as empty fields. */
2694 lima = MAX (texta, lima);
2695 limb = MAX (textb, limb);
2697 /* Find the lengths. */
2698 size_t lena = lima - texta;
2699 size_t lenb = limb - textb;
2701 if (hard_LC_COLLATE || key_numeric (key)
2702 || key->month || key->random || key->version)
2704 /* Ordinarily use the keys in-place, temporarily null-terminated. */
2705 char *ta = texta;
2706 char *tb = textb;
2707 size_t tlena = lena;
2708 size_t tlenb = lenb;
2709 char enda = ta[tlena];
2710 char endb = tb[tlenb];
2712 void *allocated = nullptr;
2713 char stackbuf[4000];
2715 if (ignore || translate)
2717 /* Compute with copies of the keys, which are the result of
2718 translating or ignoring characters, and which need their
2719 own storage. */
2721 size_t i;
2723 /* Allocate space for copies. */
2724 size_t size = lena + 1 + lenb + 1;
2725 if (size <= sizeof stackbuf)
2726 ta = stackbuf;
2727 else
2728 ta = allocated = xmalloc (size);
2729 tb = ta + lena + 1;
2731 /* Put into each copy a version of the key in which the
2732 requested characters are ignored or translated. */
2733 for (tlena = i = 0; i < lena; i++)
2734 if (! (ignore && ignore[to_uchar (texta[i])]))
2735 ta[tlena++] = (translate
2736 ? translate[to_uchar (texta[i])]
2737 : texta[i]);
2739 for (tlenb = i = 0; i < lenb; i++)
2740 if (! (ignore && ignore[to_uchar (textb[i])]))
2741 tb[tlenb++] = (translate
2742 ? translate[to_uchar (textb[i])]
2743 : textb[i]);
2746 ta[tlena] = '\0';
2747 tb[tlenb] = '\0';
2749 if (key->numeric)
2750 diff = numcompare (ta, tb);
2751 else if (key->general_numeric)
2752 diff = general_numcompare (ta, tb);
2753 else if (key->human_numeric)
2754 diff = human_numcompare (ta, tb);
2755 else if (key->month)
2756 diff = getmonth (ta, nullptr) - getmonth (tb, nullptr);
2757 else if (key->random)
2758 diff = compare_random (ta, tlena, tb, tlenb);
2759 else if (key->version)
2760 diff = filenvercmp (ta, tlena, tb, tlenb);
2761 else
2763 /* Locale-dependent string sorting. This is slower than
2764 C-locale sorting, which is implemented below. */
2765 if (tlena == 0)
2766 diff = - NONZERO (tlenb);
2767 else if (tlenb == 0)
2768 diff = 1;
2769 else
2770 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2773 ta[tlena] = enda;
2774 tb[tlenb] = endb;
2776 free (allocated);
2778 else if (ignore)
2780 #define CMP_WITH_IGNORE(A, B) \
2781 do \
2783 while (true) \
2785 while (texta < lima && ignore[to_uchar (*texta)]) \
2786 ++texta; \
2787 while (textb < limb && ignore[to_uchar (*textb)]) \
2788 ++textb; \
2789 if (! (texta < lima && textb < limb)) \
2791 diff = (texta < lima) - (textb < limb); \
2792 break; \
2794 diff = to_uchar (A) - to_uchar (B); \
2795 if (diff) \
2796 break; \
2797 ++texta; \
2798 ++textb; \
2802 while (0)
2804 if (translate)
2805 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2806 translate[to_uchar (*textb)]);
2807 else
2808 CMP_WITH_IGNORE (*texta, *textb);
2810 else
2812 size_t lenmin = MIN (lena, lenb);
2813 if (lenmin == 0)
2814 diff = 0;
2815 else if (translate)
2817 size_t i = 0;
2820 diff = (to_uchar (translate[to_uchar (texta[i])])
2821 - to_uchar (translate[to_uchar (textb[i])]));
2822 if (diff)
2823 break;
2824 i++;
2826 while (i < lenmin);
2828 else
2829 diff = memcmp (texta, textb, lenmin);
2831 if (! diff)
2832 diff = (lena > lenb) - (lena < lenb);
2835 if (diff)
2836 break;
2838 key = key->next;
2839 if (! key)
2840 return 0;
2842 /* Find the beginning and limit of the next field. */
2843 if (key->eword != SIZE_MAX)
2844 lima = limfield (a, key), limb = limfield (b, key);
2845 else
2846 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2848 if (key->sword != SIZE_MAX)
2849 texta = begfield (a, key), textb = begfield (b, key);
2850 else
2852 texta = a->text, textb = b->text;
2853 if (key->skipsblanks)
2855 while (texta < lima && blanks[to_uchar (*texta)])
2856 ++texta;
2857 while (textb < limb && blanks[to_uchar (*textb)])
2858 ++textb;
2863 return diff_reversed (diff, key->reverse);
2866 /* Compare two lines A and B, returning negative, zero, or positive
2867 depending on whether A compares less than, equal to, or greater than B. */
2869 static int
2870 compare (struct line const *a, struct line const *b)
2872 int diff;
2873 size_t alen, blen;
2875 /* First try to compare on the specified keys (if any).
2876 The only two cases with no key at all are unadorned sort,
2877 and unadorned sort -r. */
2878 if (keylist)
2880 diff = keycompare (a, b);
2881 if (diff || unique || stable)
2882 return diff;
2885 /* If the keys all compare equal (or no keys were specified)
2886 fall through to the default comparison. */
2887 alen = a->length - 1, blen = b->length - 1;
2889 if (alen == 0)
2890 diff = - NONZERO (blen);
2891 else if (blen == 0)
2892 diff = 1;
2893 else if (hard_LC_COLLATE)
2895 /* xmemcoll0 is a performance enhancement as
2896 it will not unconditionally write '\0' after the
2897 passed in buffers, which was seen to give around
2898 a 3% increase in performance for short lines. */
2899 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2901 else
2903 diff = memcmp (a->text, b->text, MIN (alen, blen));
2904 if (!diff)
2905 diff = (alen > blen) - (alen < blen);
2908 return diff_reversed (diff, reverse);
2911 /* Write LINE to output stream FP; the output file's name is
2912 OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
2913 otherwise. If debugging is enabled and FP is standard output,
2914 append some debugging information. */
2916 static void
2917 write_line (struct line const *line, FILE *fp, char const *output_file)
2919 char *buf = line->text;
2920 size_t n_bytes = line->length;
2921 char *ebuf = buf + n_bytes;
2923 if (!output_file && debug)
2925 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2926 char const *c = buf;
2928 while (c < ebuf)
2930 char wc = *c++;
2931 if (wc == '\t')
2932 wc = '>';
2933 else if (c == ebuf)
2934 wc = '\n';
2935 if (fputc (wc, fp) == EOF)
2936 sort_die (_("write failed"), output_file);
2939 debug_line (line);
2941 else
2943 ebuf[-1] = eolchar;
2944 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2945 sort_die (_("write failed"), output_file);
2946 ebuf[-1] = '\0';
2950 /* Check that the lines read from FILE_NAME come in order. Return
2951 true if they are in order. If CHECKONLY == 'c', also print a
2952 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2953 they are not in order. */
2955 static bool
2956 check (char const *file_name, char checkonly)
2958 FILE *fp = xfopen (file_name, "r");
2959 struct buffer buf; /* Input buffer. */
2960 struct line temp; /* Copy of previous line. */
2961 size_t alloc = 0;
2962 uintmax_t line_number = 0;
2963 struct keyfield const *key = keylist;
2964 bool nonunique = ! unique;
2965 bool ordered = true;
2967 initbuf (&buf, sizeof (struct line),
2968 MAX (merge_buffer_size, sort_size));
2969 temp.text = nullptr;
2971 while (fillbuf (&buf, fp, file_name))
2973 struct line const *line = buffer_linelim (&buf);
2974 struct line const *linebase = line - buf.nlines;
2976 /* Make sure the line saved from the old buffer contents is
2977 less than or equal to the first line of the new buffer. */
2978 if (alloc && nonunique <= compare (&temp, line - 1))
2980 found_disorder:
2982 if (checkonly == 'c')
2984 struct line const *disorder_line = line - 1;
2985 uintmax_t disorder_line_number =
2986 buffer_linelim (&buf) - disorder_line + line_number;
2987 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2988 fprintf (stderr, _("%s: %s:%s: disorder: "),
2989 program_name, file_name,
2990 umaxtostr (disorder_line_number, hr_buf));
2991 write_line (disorder_line, stderr, _("standard error"));
2994 ordered = false;
2995 break;
2999 /* Compare each line in the buffer with its successor. */
3000 while (linebase < --line)
3001 if (nonunique <= compare (line, line - 1))
3002 goto found_disorder;
3004 line_number += buf.nlines;
3006 /* Save the last line of the buffer. */
3007 if (alloc < line->length)
3011 alloc *= 2;
3012 if (! alloc)
3014 alloc = line->length;
3015 break;
3018 while (alloc < line->length);
3020 free (temp.text);
3021 temp.text = xmalloc (alloc);
3023 memcpy (temp.text, line->text, line->length);
3024 temp.length = line->length;
3025 if (key)
3027 temp.keybeg = temp.text + (line->keybeg - line->text);
3028 temp.keylim = temp.text + (line->keylim - line->text);
3032 xfclose (fp, file_name);
3033 free (buf.buf);
3034 free (temp.text);
3035 return ordered;
3038 /* Open FILES (there are NFILES of them) and store the resulting array
3039 of stream pointers into (*PFPS). Allocate the array. Return the
3040 number of successfully opened files, setting errno if this value is
3041 less than NFILES. */
3043 static size_t
3044 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
3046 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
3047 int i;
3049 /* Open as many input files as we can. */
3050 for (i = 0; i < nfiles; i++)
3052 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
3053 ? open_temp (files[i].temp)
3054 : stream_open (files[i].name, "r"));
3055 if (!fps[i])
3056 break;
3059 return i;
3062 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3063 files (all of which are at the start of the FILES array), and
3064 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3065 FPS is the vector of open stream corresponding to the files.
3066 Close input and output streams before returning.
3067 OUTPUT_FILE gives the name of the output file. If it is null,
3068 the output file is standard output. */
3070 static void
3071 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
3072 FILE *ofp, char const *output_file, FILE **fps)
3074 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
3075 /* Input buffers for each file. */
3076 struct line saved; /* Saved line storage for unique check. */
3077 struct line const *savedline = nullptr;
3078 /* &saved if there is a saved line. */
3079 size_t savealloc = 0; /* Size allocated for the saved line. */
3080 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
3081 /* Current line in each line table. */
3082 struct line const **base = xnmalloc (nfiles, sizeof *base);
3083 /* Base of each line table. */
3084 size_t *ord = xnmalloc (nfiles, sizeof *ord);
3085 /* Table representing a permutation of fps,
3086 such that cur[ord[0]] is the smallest line
3087 and will be next output. */
3088 size_t i;
3089 size_t j;
3090 size_t t;
3091 struct keyfield const *key = keylist;
3092 saved.text = nullptr;
3094 /* Read initial lines from each input file. */
3095 for (i = 0; i < nfiles; )
3097 initbuf (&buffer[i], sizeof (struct line),
3098 MAX (merge_buffer_size, sort_size / nfiles));
3099 if (fillbuf (&buffer[i], fps[i], files[i].name))
3101 struct line const *linelim = buffer_linelim (&buffer[i]);
3102 cur[i] = linelim - 1;
3103 base[i] = linelim - buffer[i].nlines;
3104 i++;
3106 else
3108 /* fps[i] is empty; eliminate it from future consideration. */
3109 xfclose (fps[i], files[i].name);
3110 if (i < ntemps)
3112 ntemps--;
3113 zaptemp (files[i].name);
3115 free (buffer[i].buf);
3116 --nfiles;
3117 for (j = i; j < nfiles; ++j)
3119 files[j] = files[j + 1];
3120 fps[j] = fps[j + 1];
3125 /* Set up the ord table according to comparisons among input lines.
3126 Since this only reorders two items if one is strictly greater than
3127 the other, it is stable. */
3128 for (i = 0; i < nfiles; ++i)
3129 ord[i] = i;
3130 for (i = 1; i < nfiles; ++i)
3131 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
3132 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
3134 /* Repeatedly output the smallest line until no input remains. */
3135 while (nfiles)
3137 struct line const *smallest = cur[ord[0]];
3139 /* If uniquified output is turned on, output only the first of
3140 an identical series of lines. */
3141 if (unique)
3143 if (savedline && compare (savedline, smallest))
3145 savedline = nullptr;
3146 write_line (&saved, ofp, output_file);
3148 if (!savedline)
3150 savedline = &saved;
3151 if (savealloc < smallest->length)
3154 if (! savealloc)
3156 savealloc = smallest->length;
3157 break;
3159 while ((savealloc *= 2) < smallest->length);
3161 free (saved.text);
3162 saved.text = xmalloc (savealloc);
3164 saved.length = smallest->length;
3165 memcpy (saved.text, smallest->text, saved.length);
3166 if (key)
3168 saved.keybeg =
3169 saved.text + (smallest->keybeg - smallest->text);
3170 saved.keylim =
3171 saved.text + (smallest->keylim - smallest->text);
3175 else
3176 write_line (smallest, ofp, output_file);
3178 /* Check if we need to read more lines into memory. */
3179 if (base[ord[0]] < smallest)
3180 cur[ord[0]] = smallest - 1;
3181 else
3183 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
3185 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
3186 cur[ord[0]] = linelim - 1;
3187 base[ord[0]] = linelim - buffer[ord[0]].nlines;
3189 else
3191 /* We reached EOF on fps[ord[0]]. */
3192 for (i = 1; i < nfiles; ++i)
3193 if (ord[i] > ord[0])
3194 --ord[i];
3195 --nfiles;
3196 xfclose (fps[ord[0]], files[ord[0]].name);
3197 if (ord[0] < ntemps)
3199 ntemps--;
3200 zaptemp (files[ord[0]].name);
3202 free (buffer[ord[0]].buf);
3203 for (i = ord[0]; i < nfiles; ++i)
3205 fps[i] = fps[i + 1];
3206 files[i] = files[i + 1];
3207 buffer[i] = buffer[i + 1];
3208 cur[i] = cur[i + 1];
3209 base[i] = base[i + 1];
3211 for (i = 0; i < nfiles; ++i)
3212 ord[i] = ord[i + 1];
3213 continue;
3217 /* The new line just read in may be larger than other lines
3218 already in main memory; push it back in the queue until we
3219 encounter a line larger than it. Optimize for the common
3220 case where the new line is smallest. */
3222 size_t lo = 1;
3223 size_t hi = nfiles;
3224 size_t probe = lo;
3225 size_t ord0 = ord[0];
3226 size_t count_of_smaller_lines;
3228 while (lo < hi)
3230 int cmp = compare (cur[ord0], cur[ord[probe]]);
3231 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3232 hi = probe;
3233 else
3234 lo = probe + 1;
3235 probe = (lo + hi) / 2;
3238 count_of_smaller_lines = lo - 1;
3239 for (j = 0; j < count_of_smaller_lines; j++)
3240 ord[j] = ord[j + 1];
3241 ord[count_of_smaller_lines] = ord0;
3245 if (unique && savedline)
3247 write_line (&saved, ofp, output_file);
3248 free (saved.text);
3251 xfclose (ofp, output_file);
3252 free (fps);
3253 free (buffer);
3254 free (ord);
3255 free (base);
3256 free (cur);
3259 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3260 files (all of which are at the start of the FILES array), and
3261 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3262 Close input and output files before returning.
3263 OUTPUT_FILE gives the name of the output file.
3265 Return the number of files successfully merged. This number can be
3266 less than NFILES if we ran low on file descriptors, but in this
3267 case it is never less than 2. */
3269 static size_t
3270 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3271 FILE *ofp, char const *output_file)
3273 FILE **fps;
3274 size_t nopened = open_input_files (files, nfiles, &fps);
3275 if (nopened < nfiles && nopened < 2)
3276 sort_die (_("open failed"), files[nopened].name);
3277 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3278 return nopened;
3281 /* Merge into T (of size NLINES) the two sorted arrays of lines
3282 LO (with NLINES / 2 members), and
3283 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3284 T and LO point just past their respective arrays, and the arrays
3285 are in reverse order. NLINES must be at least 2. */
3287 static void
3288 mergelines (struct line *restrict t, size_t nlines,
3289 struct line const *restrict lo)
3291 size_t nlo = nlines / 2;
3292 size_t nhi = nlines - nlo;
3293 struct line *hi = t - nlo;
3295 while (true)
3296 if (compare (lo - 1, hi - 1) <= 0)
3298 *--t = *--lo;
3299 if (! --nlo)
3301 /* HI must equal T now, and there is no need to copy from
3302 HI to T. */
3303 return;
3306 else
3308 *--t = *--hi;
3309 if (! --nhi)
3312 *--t = *--lo;
3313 while (--nlo);
3315 return;
3320 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3321 Do this all within one thread. NLINES must be at least 2.
3322 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3323 Otherwise the sort is in-place and TEMP is half-sized.
3324 The input and output arrays are in reverse order, and LINES and
3325 TEMP point just past the end of their respective arrays.
3327 Use a recursive divide-and-conquer algorithm, in the style
3328 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3329 the optimization suggested by exercise 5.2.4-10; this requires room
3330 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3331 writes that this memory optimization was originally published by
3332 D. A. Bell, Comp J. 1 (1958), 75. */
3334 static void
3335 sequential_sort (struct line *restrict lines, size_t nlines,
3336 struct line *restrict temp, bool to_temp)
3338 if (nlines == 2)
3340 /* Declare 'swap' as int, not bool, to work around a bug
3341 <https://lists.gnu.org/r/bug-coreutils/2005-10/msg00086.html>
3342 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3343 int swap = (0 < compare (&lines[-1], &lines[-2]));
3344 if (to_temp)
3346 temp[-1] = lines[-1 - swap];
3347 temp[-2] = lines[-2 + swap];
3349 else if (swap)
3351 temp[-1] = lines[-1];
3352 lines[-1] = lines[-2];
3353 lines[-2] = temp[-1];
3356 else
3358 size_t nlo = nlines / 2;
3359 size_t nhi = nlines - nlo;
3360 struct line *lo = lines;
3361 struct line *hi = lines - nlo;
3363 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3364 if (1 < nlo)
3365 sequential_sort (lo, nlo, temp, !to_temp);
3366 else if (!to_temp)
3367 temp[-1] = lo[-1];
3369 struct line *dest;
3370 struct line const *sorted_lo;
3371 if (to_temp)
3373 dest = temp;
3374 sorted_lo = lines;
3376 else
3378 dest = lines;
3379 sorted_lo = temp;
3381 mergelines (dest, nlines, sorted_lo);
3385 static struct merge_node *init_node (struct merge_node *restrict,
3386 struct merge_node *restrict,
3387 struct line *, size_t, size_t, bool);
3390 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3391 lines, with destination DEST. */
3392 static struct merge_node *
3393 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3395 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3397 struct merge_node *root = merge_tree;
3398 root->lo = root->hi = root->end_lo = root->end_hi = nullptr;
3399 root->dest = nullptr;
3400 root->nlo = root->nhi = nlines;
3401 root->parent = nullptr;
3402 root->level = MERGE_END;
3403 root->queued = false;
3404 pthread_mutex_init (&root->lock, nullptr);
3406 init_node (root, root + 1, dest, nthreads, nlines, false);
3407 return merge_tree;
3410 /* Destroy the merge tree. */
3411 static void
3412 merge_tree_destroy (size_t nthreads, struct merge_node *merge_tree)
3414 size_t n_nodes = nthreads * 2;
3415 struct merge_node *node = merge_tree;
3417 while (n_nodes--)
3419 pthread_mutex_destroy (&node->lock);
3420 node++;
3423 free (merge_tree);
3426 /* Initialize a merge tree node and its descendants. The node's
3427 parent is PARENT. The node and its descendants are taken from the
3428 array of nodes NODE_POOL. Their destination starts at DEST; they
3429 will consume NTHREADS threads. The total number of sort lines is
3430 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3431 its parent. */
3433 static struct merge_node *
3434 init_node (struct merge_node *restrict parent,
3435 struct merge_node *restrict node_pool,
3436 struct line *dest, size_t nthreads,
3437 size_t total_lines, bool is_lo_child)
3439 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3440 size_t nlo = nlines / 2;
3441 size_t nhi = nlines - nlo;
3442 struct line *lo = dest - total_lines;
3443 struct line *hi = lo - nlo;
3444 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3446 struct merge_node *node = node_pool++;
3447 node->lo = node->end_lo = lo;
3448 node->hi = node->end_hi = hi;
3449 node->dest = parent_end;
3450 node->nlo = nlo;
3451 node->nhi = nhi;
3452 node->parent = parent;
3453 node->level = parent->level + 1;
3454 node->queued = false;
3455 pthread_mutex_init (&node->lock, nullptr);
3457 if (nthreads > 1)
3459 size_t lo_threads = nthreads / 2;
3460 size_t hi_threads = nthreads - lo_threads;
3461 node->lo_child = node_pool;
3462 node_pool = init_node (node, node_pool, lo, lo_threads,
3463 total_lines, true);
3464 node->hi_child = node_pool;
3465 node_pool = init_node (node, node_pool, hi, hi_threads,
3466 total_lines, false);
3468 else
3470 node->lo_child = nullptr;
3471 node->hi_child = nullptr;
3473 return node_pool;
3477 /* Compare two merge nodes A and B for priority. */
3479 static int
3480 compare_nodes (void const *a, void const *b)
3482 struct merge_node const *nodea = a;
3483 struct merge_node const *nodeb = b;
3484 if (nodea->level == nodeb->level)
3485 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3486 return nodea->level < nodeb->level;
3489 /* Lock a merge tree NODE. */
3491 static inline void
3492 lock_node (struct merge_node *node)
3494 pthread_mutex_lock (&node->lock);
3497 /* Unlock a merge tree NODE. */
3499 static inline void
3500 unlock_node (struct merge_node *node)
3502 pthread_mutex_unlock (&node->lock);
3505 /* Destroy merge QUEUE. */
3507 static void
3508 queue_destroy (struct merge_node_queue *queue)
3510 heap_free (queue->priority_queue);
3511 pthread_cond_destroy (&queue->cond);
3512 pthread_mutex_destroy (&queue->mutex);
3515 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3516 NTHREADS threads. */
3518 static void
3519 queue_init (struct merge_node_queue *queue, size_t nthreads)
3521 /* Though it's highly unlikely all nodes are in the heap at the same
3522 time, the heap should accommodate all of them. Counting a null
3523 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3524 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3525 pthread_mutex_init (&queue->mutex, nullptr);
3526 pthread_cond_init (&queue->cond, nullptr);
3529 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3530 does not need to lock NODE. */
3532 static void
3533 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3535 pthread_mutex_lock (&queue->mutex);
3536 heap_insert (queue->priority_queue, node);
3537 node->queued = true;
3538 pthread_cond_signal (&queue->cond);
3539 pthread_mutex_unlock (&queue->mutex);
3542 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3544 static struct merge_node *
3545 queue_pop (struct merge_node_queue *queue)
3547 struct merge_node *node;
3548 pthread_mutex_lock (&queue->mutex);
3549 while (! (node = heap_remove_top (queue->priority_queue)))
3550 pthread_cond_wait (&queue->cond, &queue->mutex);
3551 pthread_mutex_unlock (&queue->mutex);
3552 lock_node (node);
3553 node->queued = false;
3554 return node;
3557 /* Output LINE to TFP, unless -u is specified and the line compares
3558 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3559 is null if TFP is standard output.
3561 This function does not save the line for comparison later, so it is
3562 appropriate only for internal sort. */
3564 static void
3565 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3567 if (unique)
3569 if (saved_line.text && ! compare (line, &saved_line))
3570 return;
3571 saved_line = *line;
3574 write_line (line, tfp, temp_output);
3577 /* Merge the lines currently available to a NODE in the binary
3578 merge tree. Merge a number of lines appropriate for this merge
3579 level, assuming TOTAL_LINES is the total number of lines.
3581 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3582 the name of TFP, or is null if TFP is standard output. */
3584 static void
3585 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3586 FILE *tfp, char const *temp_output)
3588 struct line *lo_orig = node->lo;
3589 struct line *hi_orig = node->hi;
3590 size_t to_merge = MAX_MERGE (total_lines, node->level);
3591 size_t merged_lo;
3592 size_t merged_hi;
3594 if (node->level > MERGE_ROOT)
3596 /* Merge to destination buffer. */
3597 struct line *dest = *node->dest;
3598 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3599 if (compare (node->lo - 1, node->hi - 1) <= 0)
3600 *--dest = *--node->lo;
3601 else
3602 *--dest = *--node->hi;
3604 merged_lo = lo_orig - node->lo;
3605 merged_hi = hi_orig - node->hi;
3607 if (node->nhi == merged_hi)
3608 while (node->lo != node->end_lo && to_merge--)
3609 *--dest = *--node->lo;
3610 else if (node->nlo == merged_lo)
3611 while (node->hi != node->end_hi && to_merge--)
3612 *--dest = *--node->hi;
3613 *node->dest = dest;
3615 else
3617 /* Merge directly to output. */
3618 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3620 if (compare (node->lo - 1, node->hi - 1) <= 0)
3621 write_unique (--node->lo, tfp, temp_output);
3622 else
3623 write_unique (--node->hi, tfp, temp_output);
3626 merged_lo = lo_orig - node->lo;
3627 merged_hi = hi_orig - node->hi;
3629 if (node->nhi == merged_hi)
3631 while (node->lo != node->end_lo && to_merge--)
3632 write_unique (--node->lo, tfp, temp_output);
3634 else if (node->nlo == merged_lo)
3636 while (node->hi != node->end_hi && to_merge--)
3637 write_unique (--node->hi, tfp, temp_output);
3641 /* Update NODE. */
3642 merged_lo = lo_orig - node->lo;
3643 merged_hi = hi_orig - node->hi;
3644 node->nlo -= merged_lo;
3645 node->nhi -= merged_hi;
3648 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3649 NODE's children has available lines and the other either has
3650 available lines or has exhausted its lines. */
3652 static void
3653 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3655 if (! node->queued)
3657 bool lo_avail = (node->lo - node->end_lo) != 0;
3658 bool hi_avail = (node->hi - node->end_hi) != 0;
3659 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3660 queue_insert (queue, node);
3664 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3666 static void
3667 queue_check_insert_parent (struct merge_node_queue *queue,
3668 struct merge_node *node)
3670 if (node->level > MERGE_ROOT)
3672 lock_node (node->parent);
3673 queue_check_insert (queue, node->parent);
3674 unlock_node (node->parent);
3676 else if (node->nlo + node->nhi == 0)
3678 /* If the MERGE_ROOT NODE has finished merging, insert the
3679 MERGE_END node. */
3680 queue_insert (queue, node->parent);
3684 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3685 some of those lines, until the MERGE_END node is popped.
3686 TOTAL_LINES is the total number of lines. If merging at the top
3687 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3688 null if TFP is standard output. */
3690 static void
3691 merge_loop (struct merge_node_queue *queue,
3692 size_t total_lines, FILE *tfp, char const *temp_output)
3694 while (true)
3696 struct merge_node *node = queue_pop (queue);
3698 if (node->level == MERGE_END)
3700 unlock_node (node);
3701 /* Reinsert so other threads can pop it. */
3702 queue_insert (queue, node);
3703 break;
3705 mergelines_node (node, total_lines, tfp, temp_output);
3706 queue_check_insert (queue, node);
3707 queue_check_insert_parent (queue, node);
3709 unlock_node (node);
3714 static void sortlines (struct line *restrict, size_t, size_t,
3715 struct merge_node *, struct merge_node_queue *,
3716 FILE *, char const *);
3718 /* Thread arguments for sortlines_thread. */
3720 struct thread_args
3722 /* Source, i.e., the array of lines to sort. This points just past
3723 the end of the array. */
3724 struct line *lines;
3726 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3727 size_t nthreads;
3729 /* Number of lines in LINES and DEST. */
3730 size_t const total_lines;
3732 /* Merge node. Lines from this node and this node's sibling will merged
3733 to this node's parent. */
3734 struct merge_node *const node;
3736 /* The priority queue controlling available work for the entire
3737 internal sort. */
3738 struct merge_node_queue *const queue;
3740 /* If at the top level, the file to output to, and the file's name.
3741 If the file is standard output, the file's name is null. */
3742 FILE *tfp;
3743 char const *output_temp;
3746 /* Like sortlines, except with a signature acceptable to pthread_create. */
3748 static void *
3749 sortlines_thread (void *data)
3751 struct thread_args const *args = data;
3752 sortlines (args->lines, args->nthreads, args->total_lines,
3753 args->node, args->queue, args->tfp,
3754 args->output_temp);
3755 return nullptr;
3758 /* Sort lines, possibly in parallel. The arguments are as in struct
3759 thread_args above.
3761 The algorithm has three phases: node creation, sequential sort,
3762 and binary merge.
3764 During node creation, sortlines recursively visits each node in the
3765 binary merge tree and creates a NODE structure corresponding to all the
3766 future line merging NODE is responsible for. For each call to
3767 sortlines, half the available threads are assigned to each recursive
3768 call, until a leaf node having only 1 available thread is reached.
3770 Each leaf node then performs two sequential sorts, one on each half of
3771 the lines it is responsible for. It records in its NODE structure that
3772 there are two sorted sublists available to merge from, and inserts its
3773 NODE into the priority queue.
3775 The binary merge phase then begins. Each thread drops into a loop
3776 where the thread retrieves a NODE from the priority queue, merges lines
3777 available to that NODE, and potentially insert NODE or its parent back
3778 into the queue if there are sufficient available lines for them to
3779 merge. This continues until all lines at all nodes of the merge tree
3780 have been merged. */
3782 static void
3783 sortlines (struct line *restrict lines, size_t nthreads,
3784 size_t total_lines, struct merge_node *node,
3785 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3787 size_t nlines = node->nlo + node->nhi;
3789 /* Calculate thread arguments. */
3790 size_t lo_threads = nthreads / 2;
3791 size_t hi_threads = nthreads - lo_threads;
3792 pthread_t thread;
3793 struct thread_args args = {lines, lo_threads, total_lines,
3794 node->lo_child, queue, tfp, temp_output};
3796 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3797 && pthread_create (&thread, nullptr, sortlines_thread, &args) == 0)
3799 sortlines (lines - node->nlo, hi_threads, total_lines,
3800 node->hi_child, queue, tfp, temp_output);
3801 pthread_join (thread, nullptr);
3803 else
3805 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3806 Sort with 1 thread. */
3807 size_t nlo = node->nlo;
3808 size_t nhi = node->nhi;
3809 struct line *temp = lines - total_lines;
3810 if (1 < nhi)
3811 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3812 if (1 < nlo)
3813 sequential_sort (lines, nlo, temp, false);
3815 /* Update merge NODE. No need to lock yet. */
3816 node->lo = lines;
3817 node->hi = lines - nlo;
3818 node->end_lo = lines - nlo;
3819 node->end_hi = lines - nlo - nhi;
3821 queue_insert (queue, node);
3822 merge_loop (queue, total_lines, tfp, temp_output);
3826 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3827 the same as OUTFILE. If found, replace each with the same
3828 temporary copy that can be merged into OUTFILE without destroying
3829 OUTFILE before it is completely read. This temporary copy does not
3830 count as a merge temp, so don't worry about incrementing NTEMPS in
3831 the caller; final cleanup will remove it, not zaptemp.
3833 This test ensures that an otherwise-erroneous use like
3834 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3835 It's not clear that POSIX requires this nicety.
3836 Detect common error cases, but don't try to catch obscure cases like
3837 "cat ... FILE ... | sort -m -o FILE"
3838 where traditional "sort" doesn't copy the input and where
3839 people should know that they're getting into trouble anyway.
3840 Catching these obscure cases would slow down performance in
3841 common cases. */
3843 static void
3844 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3845 size_t nfiles, char const *outfile)
3847 struct tempnode *tempcopy = nullptr;
3849 for (size_t i = ntemps; i < nfiles; i++)
3851 bool is_stdin = STREQ (files[i].name, "-");
3852 bool same;
3853 struct stat instat;
3855 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3856 same = true;
3857 else
3859 struct stat *outst = get_outstatus ();
3860 if (!outst)
3861 break;
3863 same = (((is_stdin
3864 ? fstat (STDIN_FILENO, &instat)
3865 : stat (files[i].name, &instat))
3866 == 0)
3867 && psame_inode (&instat, outst));
3870 if (same)
3872 if (! tempcopy)
3874 FILE *tftp;
3875 tempcopy = create_temp (&tftp);
3876 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3879 files[i].name = tempcopy->name;
3880 files[i].temp = tempcopy;
3885 /* Scan the input files to ensure all are accessible.
3886 Otherwise exit with a diagnostic.
3888 This will catch common issues with permissions etc.
3889 but will fail to notice issues where you can open but not read,
3890 like when a directory is specified on some systems.
3891 Catching these obscure cases could slow down performance in
3892 common cases. */
3894 static void
3895 check_inputs (char *const *files, size_t nfiles)
3897 for (size_t i = 0; i < nfiles; i++)
3899 if (STREQ (files[i], "-"))
3900 continue;
3902 if (euidaccess (files[i], R_OK) != 0)
3903 sort_die (_("cannot read"), files[i]);
3907 /* Ensure a specified output file can be created or written to,
3908 and point stdout to it. Do not truncate the file.
3909 Exit with a diagnostic on failure. */
3911 static void
3912 check_output (char const *outfile)
3914 if (outfile)
3916 int oflags = O_WRONLY | O_BINARY | O_CLOEXEC | O_CREAT;
3917 int outfd = open (outfile, oflags, MODE_RW_UGO);
3918 if (outfd < 0)
3919 sort_die (_("open failed"), outfile);
3920 move_fd (outfd, STDOUT_FILENO);
3924 /* Merge the input FILES. NTEMPS is the number of files at the
3925 start of FILES that are temporary; it is zero at the top level.
3926 NFILES is the total number of files. Put the output in
3927 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3929 static void
3930 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3931 char const *output_file)
3933 while (nmerge < nfiles)
3935 /* Number of input files processed so far. */
3936 size_t in;
3938 /* Number of output files generated so far. */
3939 size_t out;
3941 /* nfiles % NMERGE; this counts input files that are left over
3942 after all full-sized merges have been done. */
3943 size_t remainder;
3945 /* Number of easily-available slots at the next loop iteration. */
3946 size_t cheap_slots;
3948 /* Do as many NMERGE-size merges as possible. In the case that
3949 nmerge is bogus, increment by the maximum number of file
3950 descriptors allowed. */
3951 for (out = in = 0; nmerge <= nfiles - in; out++)
3953 FILE *tfp;
3954 struct tempnode *temp = create_temp (&tfp);
3955 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3956 nmerge, tfp, temp->name);
3957 ntemps -= MIN (ntemps, num_merged);
3958 files[out].name = temp->name;
3959 files[out].temp = temp;
3960 in += num_merged;
3963 remainder = nfiles - in;
3964 cheap_slots = nmerge - out % nmerge;
3966 if (cheap_slots < remainder)
3968 /* So many files remain that they can't all be put into the last
3969 NMERGE-sized output window. Do one more merge. Merge as few
3970 files as possible, to avoid needless I/O. */
3971 size_t nshortmerge = remainder - cheap_slots + 1;
3972 FILE *tfp;
3973 struct tempnode *temp = create_temp (&tfp);
3974 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3975 nshortmerge, tfp, temp->name);
3976 ntemps -= MIN (ntemps, num_merged);
3977 files[out].name = temp->name;
3978 files[out++].temp = temp;
3979 in += num_merged;
3982 /* Put the remaining input files into the last NMERGE-sized output
3983 window, so they will be merged in the next pass. */
3984 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3985 ntemps += out;
3986 nfiles -= in - out;
3989 avoid_trashing_input (files, ntemps, nfiles, output_file);
3991 /* We aren't guaranteed that this final mergefiles will work, therefore we
3992 try to merge into the output, and then merge as much as we can into a
3993 temp file if we can't. Repeat. */
3995 while (true)
3997 /* Merge directly into the output file if possible. */
3998 FILE **fps;
3999 size_t nopened = open_input_files (files, nfiles, &fps);
4001 if (nopened == nfiles)
4003 FILE *ofp = stream_open (output_file, "w");
4004 if (ofp)
4006 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
4007 break;
4009 if (errno != EMFILE || nopened <= 2)
4010 sort_die (_("open failed"), output_file);
4012 else if (nopened <= 2)
4013 sort_die (_("open failed"), files[nopened].name);
4015 /* We ran out of file descriptors. Close one of the input
4016 files, to gain a file descriptor. Then create a temporary
4017 file with our spare file descriptor. Retry if that failed
4018 (e.g., some other process could open a file between the time
4019 we closed and tried to create). */
4020 FILE *tfp;
4021 struct tempnode *temp;
4024 nopened--;
4025 xfclose (fps[nopened], files[nopened].name);
4026 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
4028 while (!temp);
4030 /* Merge into the newly allocated temporary. */
4031 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
4032 fps);
4033 ntemps -= MIN (ntemps, nopened);
4034 files[0].name = temp->name;
4035 files[0].temp = temp;
4037 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
4038 ntemps++;
4039 nfiles -= nopened - 1;
4043 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
4045 static void
4046 sort (char *const *files, size_t nfiles, char const *output_file,
4047 size_t nthreads)
4049 struct buffer buf;
4050 size_t ntemps = 0;
4051 bool output_file_created = false;
4053 buf.alloc = 0;
4055 while (nfiles)
4057 char const *temp_output;
4058 char const *file = *files;
4059 FILE *fp = xfopen (file, "r");
4060 FILE *tfp;
4062 size_t bytes_per_line;
4063 if (nthreads > 1)
4065 /* Get log P. */
4066 size_t tmp = 1;
4067 size_t mult = 1;
4068 while (tmp < nthreads)
4070 tmp *= 2;
4071 mult++;
4073 bytes_per_line = (mult * sizeof (struct line));
4075 else
4076 bytes_per_line = sizeof (struct line) * 3 / 2;
4078 if (! buf.alloc)
4079 initbuf (&buf, bytes_per_line,
4080 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
4081 buf.eof = false;
4082 files++;
4083 nfiles--;
4085 while (fillbuf (&buf, fp, file))
4087 struct line *line;
4089 if (buf.eof && nfiles
4090 && (bytes_per_line + 1
4091 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
4093 /* End of file, but there is more input and buffer room.
4094 Concatenate the next input file; this is faster in
4095 the usual case. */
4096 buf.left = buf.used;
4097 break;
4100 saved_line.text = nullptr;
4101 line = buffer_linelim (&buf);
4102 if (buf.eof && !nfiles && !ntemps && !buf.left)
4104 xfclose (fp, file);
4105 tfp = xfopen (output_file, "w");
4106 temp_output = output_file;
4107 output_file_created = true;
4109 else
4111 ++ntemps;
4112 temp_output = create_temp (&tfp)->name;
4114 if (1 < buf.nlines)
4116 struct merge_node_queue queue;
4117 queue_init (&queue, nthreads);
4118 struct merge_node *merge_tree =
4119 merge_tree_init (nthreads, buf.nlines, line);
4121 sortlines (line, nthreads, buf.nlines, merge_tree + 1,
4122 &queue, tfp, temp_output);
4124 merge_tree_destroy (nthreads, merge_tree);
4125 queue_destroy (&queue);
4127 else
4128 write_unique (line - 1, tfp, temp_output);
4130 xfclose (tfp, temp_output);
4132 if (output_file_created)
4133 goto finish;
4135 xfclose (fp, file);
4138 finish:
4139 free (buf.buf);
4141 if (! output_file_created)
4143 struct tempnode *node = temphead;
4144 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
4145 for (size_t i = 0; node; i++)
4147 tempfiles[i].name = node->name;
4148 tempfiles[i].temp = node;
4149 node = node->next;
4151 merge (tempfiles, ntemps, ntemps, output_file);
4152 free (tempfiles);
4155 reap_all ();
4158 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
4160 static void
4161 insertkey (struct keyfield *key_arg)
4163 struct keyfield **p;
4164 struct keyfield *key = xmemdup (key_arg, sizeof *key);
4166 for (p = &keylist; *p; p = &(*p)->next)
4167 continue;
4168 *p = key;
4169 key->next = nullptr;
4172 /* Report a bad field specification SPEC, with extra info MSGID. */
4174 static void
4175 badfieldspec (char const *spec, char const *msgid)
4177 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
4178 _(msgid), quote (spec));
4181 /* Report incompatible options. */
4183 static void
4184 incompatible_options (char const *opts)
4186 error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), (opts));
4189 /* Check compatibility of ordering options. */
4191 static void
4192 check_ordering_compatibility (void)
4194 struct keyfield *key;
4196 for (key = keylist; key; key = key->next)
4197 if (1 < (key->numeric + key->general_numeric + key->human_numeric
4198 + key->month + (key->version | key->random | !!key->ignore)))
4200 /* The following is too big, but guaranteed to be "big enough". */
4201 char opts[sizeof short_options];
4202 /* Clear flags we're not interested in. */
4203 key->skipsblanks = key->skipeblanks = key->reverse = false;
4204 key_to_opts (key, opts);
4205 incompatible_options (opts);
4209 /* Parse the leading integer in STRING and store the resulting value
4210 (which must fit into size_t) into *VAL. Return the address of the
4211 suffix after the integer. If the value is too large, silently
4212 substitute SIZE_MAX. If MSGID is null, return nullptr after
4213 failure; otherwise, report MSGID and exit on failure. */
4215 static char const *
4216 parse_field_count (char const *string, size_t *val, char const *msgid)
4218 char *suffix;
4219 uintmax_t n;
4221 switch (xstrtoumax (string, &suffix, 10, &n, ""))
4223 case LONGINT_OK:
4224 case LONGINT_INVALID_SUFFIX_CHAR:
4225 *val = n;
4226 if (*val == n)
4227 break;
4228 FALLTHROUGH;
4229 case LONGINT_OVERFLOW:
4230 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
4231 *val = SIZE_MAX;
4232 break;
4234 case LONGINT_INVALID:
4235 if (msgid)
4236 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
4237 _(msgid), quote (string));
4238 return nullptr;
4241 return suffix;
4244 /* Handle interrupts and hangups. */
4246 static void
4247 sighandler (int sig)
4249 if (! SA_NOCLDSTOP)
4250 signal (sig, SIG_IGN);
4252 cleanup ();
4254 signal (sig, SIG_DFL);
4255 raise (sig);
4258 /* Set the ordering options for KEY specified in S.
4259 Return the address of the first character in S that
4260 is not a valid ordering option.
4261 BLANKTYPE is the kind of blanks that 'b' should skip. */
4263 static char *
4264 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4266 while (*s)
4268 switch (*s)
4270 case 'b':
4271 if (blanktype == bl_start || blanktype == bl_both)
4272 key->skipsblanks = true;
4273 if (blanktype == bl_end || blanktype == bl_both)
4274 key->skipeblanks = true;
4275 break;
4276 case 'd':
4277 key->ignore = nondictionary;
4278 break;
4279 case 'f':
4280 key->translate = fold_toupper;
4281 break;
4282 case 'g':
4283 key->general_numeric = true;
4284 break;
4285 case 'h':
4286 key->human_numeric = true;
4287 break;
4288 case 'i':
4289 /* Option order should not matter, so don't let -i override
4290 -d. -d implies -i, but -i does not imply -d. */
4291 if (! key->ignore)
4292 key->ignore = nonprinting;
4293 break;
4294 case 'M':
4295 key->month = true;
4296 break;
4297 case 'n':
4298 key->numeric = true;
4299 break;
4300 case 'R':
4301 key->random = true;
4302 break;
4303 case 'r':
4304 key->reverse = true;
4305 break;
4306 case 'V':
4307 key->version = true;
4308 break;
4309 default:
4310 return (char *) s;
4312 ++s;
4314 return (char *) s;
4317 /* Initialize KEY. */
4319 static struct keyfield *
4320 key_init (struct keyfield *key)
4322 memset (key, 0, sizeof *key);
4323 key->eword = SIZE_MAX;
4324 return key;
4328 main (int argc, char **argv)
4330 struct keyfield *key;
4331 struct keyfield key_buf;
4332 struct keyfield gkey;
4333 bool gkey_only = false;
4334 char const *s;
4335 int c = 0;
4336 char checkonly = 0;
4337 bool mergeonly = false;
4338 char *random_source = nullptr;
4339 bool need_random = false;
4340 size_t nthreads = 0;
4341 size_t nfiles = 0;
4342 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
4343 int posix_ver = posix2_version ();
4344 bool traditional_usage = ! (200112 <= posix_ver && posix_ver < 200809);
4345 char **files;
4346 char *files_from = nullptr;
4347 struct Tokens tok;
4348 char const *outfile = nullptr;
4349 bool locale_ok;
4351 initialize_main (&argc, &argv);
4352 set_program_name (argv[0]);
4353 locale_ok = !! setlocale (LC_ALL, "");
4354 bindtextdomain (PACKAGE, LOCALEDIR);
4355 textdomain (PACKAGE);
4357 initialize_exit_failure (SORT_FAILURE);
4359 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4360 #if HAVE_NL_LANGINFO
4361 hard_LC_TIME = hard_locale (LC_TIME);
4362 #endif
4364 /* Get locale's representation of the decimal point. */
4366 struct lconv const *locale = localeconv ();
4368 /* If the locale doesn't define a decimal point, or if the decimal
4369 point is multibyte, use the C locale's decimal point. FIXME:
4370 add support for multibyte decimal points. */
4371 decimal_point = locale->decimal_point[0];
4372 if (! decimal_point || locale->decimal_point[1])
4373 decimal_point = '.';
4375 /* FIXME: add support for multibyte thousands separators. */
4376 thousands_sep = locale->thousands_sep[0];
4377 if (thousands_sep && locale->thousands_sep[1])
4378 thousands_sep_ignored = true;
4379 if (! thousands_sep || locale->thousands_sep[1])
4380 thousands_sep = NON_CHAR;
4383 have_read_stdin = false;
4384 inittables ();
4387 size_t i;
4388 static int const sig[] =
4390 /* The usual suspects. */
4391 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4392 #ifdef SIGPOLL
4393 SIGPOLL,
4394 #endif
4395 #ifdef SIGPROF
4396 SIGPROF,
4397 #endif
4398 #ifdef SIGVTALRM
4399 SIGVTALRM,
4400 #endif
4401 #ifdef SIGXCPU
4402 SIGXCPU,
4403 #endif
4404 #ifdef SIGXFSZ
4405 SIGXFSZ,
4406 #endif
4408 enum { nsigs = ARRAY_CARDINALITY (sig) };
4410 #if SA_NOCLDSTOP
4411 struct sigaction act;
4413 sigemptyset (&caught_signals);
4414 for (i = 0; i < nsigs; i++)
4416 sigaction (sig[i], nullptr, &act);
4417 if (act.sa_handler != SIG_IGN)
4418 sigaddset (&caught_signals, sig[i]);
4421 act.sa_handler = sighandler;
4422 act.sa_mask = caught_signals;
4423 act.sa_flags = 0;
4425 for (i = 0; i < nsigs; i++)
4426 if (sigismember (&caught_signals, sig[i]))
4427 sigaction (sig[i], &act, nullptr);
4428 #else
4429 for (i = 0; i < nsigs; i++)
4430 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4432 signal (sig[i], sighandler);
4433 siginterrupt (sig[i], 1);
4435 #endif
4437 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4439 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4440 atexit (exit_cleanup);
4442 key_init (&gkey);
4443 gkey.sword = SIZE_MAX;
4445 files = xnmalloc (argc, sizeof *files);
4447 while (true)
4449 /* Parse an operand as a file after "--" was seen; or if
4450 pedantic and a file was seen, unless the POSIX version
4451 is not 1003.1-2001 and -c was not seen and the operand is
4452 "-o FILE" or "-oFILE". */
4453 int oi = -1;
4455 if (c == -1
4456 || (posixly_correct && nfiles != 0
4457 && ! (traditional_usage
4458 && ! checkonly
4459 && optind != argc
4460 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4461 && (argv[optind][2] || optind + 1 != argc)))
4462 || ((c = getopt_long (argc, argv, short_options,
4463 long_options, &oi))
4464 == -1))
4466 if (argc <= optind)
4467 break;
4468 files[nfiles++] = argv[optind++];
4470 else switch (c)
4472 case 1:
4473 key = nullptr;
4474 if (optarg[0] == '+')
4476 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4477 && ISDIGIT (argv[optind][1]));
4478 traditional_usage |= minus_pos_usage && !posixly_correct;
4479 if (traditional_usage)
4481 /* Treat +POS1 [-POS2] as a key if possible; but silently
4482 treat an operand as a file if it is not a valid +POS1. */
4483 key = key_init (&key_buf);
4484 s = parse_field_count (optarg + 1, &key->sword, nullptr);
4485 if (s && *s == '.')
4486 s = parse_field_count (s + 1, &key->schar, nullptr);
4487 if (! (key->sword || key->schar))
4488 key->sword = SIZE_MAX;
4489 if (! s || *set_ordering (s, key, bl_start))
4490 key = nullptr;
4491 else
4493 if (minus_pos_usage)
4495 char const *optarg1 = argv[optind++];
4496 s = parse_field_count (optarg1 + 1, &key->eword,
4497 N_("invalid number after '-'"));
4498 if (*s == '.')
4499 s = parse_field_count (s + 1, &key->echar,
4500 N_("invalid number after '.'"));
4501 if (!key->echar && key->eword)
4503 /* obsolescent syntax +A.x -B.y is equivalent to:
4504 -k A+1.x+1,B.y (when y = 0)
4505 -k A+1.x+1,B+1.y (when y > 0)
4506 So eword is decremented as in the -k case
4507 only when the end field (B) is specified and
4508 echar (y) is 0. */
4509 key->eword--;
4511 if (*set_ordering (s, key, bl_end))
4512 badfieldspec (optarg1,
4513 N_("stray character in field spec"));
4515 key->traditional_used = true;
4516 insertkey (key);
4520 if (! key)
4521 files[nfiles++] = optarg;
4522 break;
4524 case SORT_OPTION:
4525 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4526 FALLTHROUGH;
4527 case 'b':
4528 case 'd':
4529 case 'f':
4530 case 'g':
4531 case 'h':
4532 case 'i':
4533 case 'M':
4534 case 'n':
4535 case 'r':
4536 case 'R':
4537 case 'V':
4539 char str[2];
4540 str[0] = c;
4541 str[1] = '\0';
4542 set_ordering (str, &gkey, bl_both);
4544 break;
4546 case CHECK_OPTION:
4547 c = (optarg
4548 ? XARGMATCH ("--check", optarg, check_args, check_types)
4549 : 'c');
4550 FALLTHROUGH;
4551 case 'c':
4552 case 'C':
4553 if (checkonly && checkonly != c)
4554 incompatible_options ("cC");
4555 checkonly = c;
4556 break;
4558 case COMPRESS_PROGRAM_OPTION:
4559 if (compress_program && !STREQ (compress_program, optarg))
4560 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4561 compress_program = optarg;
4562 break;
4564 case DEBUG_PROGRAM_OPTION:
4565 debug = true;
4566 break;
4568 case FILES0_FROM_OPTION:
4569 files_from = optarg;
4570 break;
4572 case 'k':
4573 key = key_init (&key_buf);
4575 /* Get POS1. */
4576 s = parse_field_count (optarg, &key->sword,
4577 N_("invalid number at field start"));
4578 if (! key->sword--)
4580 /* Provoke with 'sort -k0' */
4581 badfieldspec (optarg, N_("field number is zero"));
4583 if (*s == '.')
4585 s = parse_field_count (s + 1, &key->schar,
4586 N_("invalid number after '.'"));
4587 if (! key->schar--)
4589 /* Provoke with 'sort -k1.0' */
4590 badfieldspec (optarg, N_("character offset is zero"));
4593 if (! (key->sword || key->schar))
4594 key->sword = SIZE_MAX;
4595 s = set_ordering (s, key, bl_start);
4596 if (*s != ',')
4598 key->eword = SIZE_MAX;
4599 key->echar = 0;
4601 else
4603 /* Get POS2. */
4604 s = parse_field_count (s + 1, &key->eword,
4605 N_("invalid number after ','"));
4606 if (! key->eword--)
4608 /* Provoke with 'sort -k1,0' */
4609 badfieldspec (optarg, N_("field number is zero"));
4611 if (*s == '.')
4613 s = parse_field_count (s + 1, &key->echar,
4614 N_("invalid number after '.'"));
4616 s = set_ordering (s, key, bl_end);
4618 if (*s)
4619 badfieldspec (optarg, N_("stray character in field spec"));
4620 insertkey (key);
4621 break;
4623 case 'm':
4624 mergeonly = true;
4625 break;
4627 case NMERGE_OPTION:
4628 specify_nmerge (oi, c, optarg);
4629 break;
4631 case 'o':
4632 if (outfile && !STREQ (outfile, optarg))
4633 error (SORT_FAILURE, 0, _("multiple output files specified"));
4634 outfile = optarg;
4635 break;
4637 case RANDOM_SOURCE_OPTION:
4638 if (random_source && !STREQ (random_source, optarg))
4639 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4640 random_source = optarg;
4641 break;
4643 case 's':
4644 stable = true;
4645 break;
4647 case 'S':
4648 specify_sort_size (oi, c, optarg);
4649 break;
4651 case 't':
4653 char newtab = optarg[0];
4654 if (! newtab)
4655 error (SORT_FAILURE, 0, _("empty tab"));
4656 if (optarg[1])
4658 if (STREQ (optarg, "\\0"))
4659 newtab = '\0';
4660 else
4662 /* Provoke with 'sort -txx'. Complain about
4663 "multi-character tab" instead of "multibyte tab", so
4664 that the diagnostic's wording does not need to be
4665 changed once multibyte characters are supported. */
4666 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4667 quote (optarg));
4670 if (tab != TAB_DEFAULT && tab != newtab)
4671 error (SORT_FAILURE, 0, _("incompatible tabs"));
4672 tab = newtab;
4674 break;
4676 case 'T':
4677 add_temp_dir (optarg);
4678 break;
4680 case PARALLEL_OPTION:
4681 nthreads = specify_nthreads (oi, c, optarg);
4682 break;
4684 case 'u':
4685 unique = true;
4686 break;
4688 case 'y':
4689 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4690 through Solaris 7. It is also accepted by many non-Solaris
4691 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4692 -y is marked as obsolete starting with Solaris 8 (1999), but is
4693 still accepted as of Solaris 10 prerelease (2004).
4695 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4696 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4697 and which in general ignores the argument after "-y" if it
4698 consists entirely of digits (it can even be empty). */
4699 if (optarg == argv[optind - 1])
4701 char const *p;
4702 for (p = optarg; ISDIGIT (*p); p++)
4703 continue;
4704 optind -= (*p != '\0');
4706 break;
4708 case 'z':
4709 eolchar = 0;
4710 break;
4712 case_GETOPT_HELP_CHAR;
4714 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4716 default:
4717 usage (SORT_FAILURE);
4721 if (files_from)
4723 /* When using --files0-from=F, you may not specify any files
4724 on the command-line. */
4725 if (nfiles)
4727 error (0, 0, _("extra operand %s"), quoteaf (files[0]));
4728 fprintf (stderr, "%s\n",
4729 _("file operands cannot be combined with --files0-from"));
4730 usage (SORT_FAILURE);
4733 FILE *stream = xfopen (files_from, "r");
4735 readtokens0_init (&tok);
4737 if (! readtokens0 (stream, &tok))
4738 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4739 quoteaf (files_from));
4740 xfclose (stream, files_from);
4742 if (tok.n_tok)
4744 free (files);
4745 files = tok.tok;
4746 nfiles = tok.n_tok;
4747 for (size_t i = 0; i < nfiles; i++)
4749 if (STREQ (files[i], "-"))
4750 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4751 "no file name of %s allowed"),
4752 quoteaf (files[i]));
4753 else if (files[i][0] == '\0')
4755 /* Using the standard 'filename:line-number:' prefix here is
4756 not totally appropriate, since NUL is the separator,
4757 not NL, but it might be better than nothing. */
4758 unsigned long int file_number = i + 1;
4759 error (SORT_FAILURE, 0,
4760 _("%s:%lu: invalid zero-length file name"),
4761 quotef (files_from), file_number);
4765 else
4766 error (SORT_FAILURE, 0, _("no input from %s"),
4767 quoteaf (files_from));
4770 /* Inheritance of global options to individual keys. */
4771 for (key = keylist; key; key = key->next)
4773 if (default_key_compare (key) && !key->reverse)
4775 key->ignore = gkey.ignore;
4776 key->translate = gkey.translate;
4777 key->skipsblanks = gkey.skipsblanks;
4778 key->skipeblanks = gkey.skipeblanks;
4779 key->month = gkey.month;
4780 key->numeric = gkey.numeric;
4781 key->general_numeric = gkey.general_numeric;
4782 key->human_numeric = gkey.human_numeric;
4783 key->version = gkey.version;
4784 key->random = gkey.random;
4785 key->reverse = gkey.reverse;
4788 need_random |= key->random;
4791 if (!keylist && !default_key_compare (&gkey))
4793 gkey_only = true;
4794 insertkey (&gkey);
4795 need_random |= gkey.random;
4798 check_ordering_compatibility ();
4800 if (debug)
4802 if (checkonly || outfile)
4804 static char opts[] = "X --debug";
4805 opts[0] = (checkonly ? checkonly : 'o');
4806 incompatible_options (opts);
4809 /* Always output the locale in debug mode, since this
4810 is such a common source of confusion. */
4812 /* OpenBSD can only set some categories with LC_ALL above,
4813 so set LC_COLLATE explicitly to flag errors. */
4814 if (locale_ok)
4815 locale_ok = !! setlocale (LC_COLLATE, "");
4816 if (! locale_ok)
4817 error (0, 0, "%s", _("failed to set locale"));
4818 if (hard_LC_COLLATE)
4819 error (0, 0, _("text ordering performed using %s sorting rules"),
4820 quote (setlocale (LC_COLLATE, nullptr)));
4821 else
4822 error (0, 0, "%s",
4823 _("text ordering performed using simple byte comparison"));
4825 key_warnings (&gkey, gkey_only);
4828 reverse = gkey.reverse;
4830 if (need_random)
4831 random_md5_state_init (random_source);
4833 if (temp_dir_count == 0)
4835 char const *tmp_dir = getenv ("TMPDIR");
4836 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4839 if (nfiles == 0)
4841 nfiles = 1;
4842 free (files);
4843 files = xmalloc (sizeof *files);
4844 *files = (char *) "-";
4847 /* Need to re-check that we meet the minimum requirement for memory
4848 usage with the final value for NMERGE. */
4849 if (0 < sort_size)
4850 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4852 if (checkonly)
4854 if (nfiles > 1)
4855 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4856 quoteaf (files[1]), checkonly);
4858 if (outfile)
4860 static char opts[] = {0, 'o', 0};
4861 opts[0] = checkonly;
4862 incompatible_options (opts);
4865 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4866 input is not properly sorted. */
4867 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4870 /* Check all inputs are accessible, or exit immediately. */
4871 check_inputs (files, nfiles);
4873 /* Check output is writable, or exit immediately. */
4874 check_output (outfile);
4876 if (mergeonly)
4878 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4880 for (size_t i = 0; i < nfiles; ++i)
4881 sortfiles[i].name = files[i];
4883 merge (sortfiles, 0, nfiles, outfile);
4885 else
4887 if (!nthreads)
4889 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4890 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4893 /* Avoid integer overflow later. */
4894 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4895 nthreads = MIN (nthreads, nthreads_max);
4897 sort (files, nfiles, outfile, nthreads);
4900 if (have_read_stdin && fclose (stdin) == EOF)
4901 sort_die (_("close failed"), "-");
4903 main_exit (EXIT_SUCCESS);