tests: avoid false failure with unicode decomposed file systems
[coreutils.git] / src / sort.c
blob7205007745041d27e9b501b669e4e543f4da9751
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <ctype.h>
26 #include <getopt.h>
27 #include <pthread.h>
28 #include <sys/resource.h>
29 #include <sys/types.h>
30 #include <sys/wait.h>
31 #include <signal.h>
32 #include "system.h"
33 #include "argmatch.h"
34 #include "assure.h"
35 #include "fadvise.h"
36 #include "filevercmp.h"
37 #include "flexmember.h"
38 #include "hard-locale.h"
39 #include "hash.h"
40 #include "heap.h"
41 #include "ignore-value.h"
42 #include "mbswidth.h"
43 #include "nproc.h"
44 #include "physmem.h"
45 #include "posixver.h"
46 #include "quote.h"
47 #include "randread.h"
48 #include "readtokens0.h"
49 #include "stdlib--.h"
50 #include "strnumcmp.h"
51 #include "xmemcoll.h"
52 #include "xnanosleep.h"
53 #include "xstrtol.h"
54 #include "xstrtol-error.h"
56 #ifndef RLIMIT_DATA
57 struct rlimit { size_t rlim_cur; };
58 # define getrlimit(Resource, Rlp) (-1)
59 #endif
61 /* The official name of this program (e.g., no 'g' prefix). */
62 #define PROGRAM_NAME "sort"
64 #define AUTHORS \
65 proper_name ("Mike Haertel"), \
66 proper_name ("Paul Eggert")
68 #if HAVE_LANGINFO_CODESET
69 # include <langinfo.h>
70 #endif
72 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
73 present. */
74 #ifndef SA_NOCLDSTOP
75 # define SA_NOCLDSTOP 0
76 /* No sigprocmask. Always 'return' zero. */
77 # define sigprocmask(How, Set, Oset) (0)
78 # define sigset_t int
79 # if ! HAVE_SIGINTERRUPT
80 # define siginterrupt(sig, flag) /* empty */
81 # endif
82 #endif
84 #if !defined OPEN_MAX && defined NR_OPEN
85 # define OPEN_MAX NR_OPEN
86 #endif
87 #if !defined OPEN_MAX
88 # define OPEN_MAX 20
89 #endif
91 #define UCHAR_LIM (UCHAR_MAX + 1)
93 #ifndef DEFAULT_TMPDIR
94 # define DEFAULT_TMPDIR "/tmp"
95 #endif
97 /* Maximum number of lines to merge every time a NODE is taken from
98 the merge queue. Node is at LEVEL in the binary merge tree,
99 and is responsible for merging TOTAL lines. */
100 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
102 /* Heuristic value for the number of lines for which it is worth creating
103 a subthread, during an internal merge sort. I.e., it is a small number
104 of "average" lines for which sorting via two threads is faster than
105 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
106 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
107 lines of gensort -a output is sorted slightly faster with --parallel=2
108 than with --parallel=1. By contrast, using --parallel=1 is about 10%
109 faster than using --parallel=2 with a 64K-line input. */
110 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
111 static_assert (4 <= SUBTHREAD_LINES_HEURISTIC);
113 /* The number of threads after which there are
114 diminishing performance gains. */
115 enum { DEFAULT_MAX_THREADS = 8 };
117 /* Exit statuses. */
118 enum
120 /* POSIX says to exit with status 1 if invoked with -c and the
121 input is not properly sorted. */
122 SORT_OUT_OF_ORDER = 1,
124 /* POSIX says any other irregular exit must exit with a status
125 code greater than 1. */
126 SORT_FAILURE = 2
129 enum
131 /* The number of times we should try to fork a compression process
132 (we retry if the fork call fails). We don't _need_ to compress
133 temp files, this is just to reduce file system access, so this number
134 can be small. Each retry doubles in duration. */
135 MAX_FORK_TRIES_COMPRESS = 4,
137 /* The number of times we should try to fork a decompression process.
138 If we can't fork a decompression process, we can't sort, so this
139 number should be big. Each retry doubles in duration. */
140 MAX_FORK_TRIES_DECOMPRESS = 9
143 enum
145 /* Level of the end-of-merge node, one level above the root. */
146 MERGE_END = 0,
148 /* Level of the root node in merge tree. */
149 MERGE_ROOT = 1
152 /* The representation of the decimal point in the current locale. */
153 static char decimal_point;
155 /* Thousands separator; if outside char range, there is no separator. */
156 static int thousands_sep;
157 /* We currently ignore multi-byte grouping chars. */
158 static bool thousands_sep_ignored;
160 /* Nonzero if the corresponding locales are hard. */
161 static bool hard_LC_COLLATE;
162 #if HAVE_NL_LANGINFO
163 static bool hard_LC_TIME;
164 #endif
166 #define NONZERO(x) ((x) != 0)
168 /* The kind of blanks for '-b' to skip in various options. */
169 enum blanktype { bl_start, bl_end, bl_both };
171 /* The character marking end of line. Default to \n. */
172 static char eolchar = '\n';
174 /* Lines are held in memory as counted strings. */
175 struct line
177 char *text; /* Text of the line. */
178 size_t length; /* Length including final newline. */
179 char *keybeg; /* Start of first key. */
180 char *keylim; /* Limit of first key. */
183 /* Input buffers. */
184 struct buffer
186 char *buf; /* Dynamically allocated buffer,
187 partitioned into 3 regions:
188 - input data;
189 - unused area;
190 - an array of lines, in reverse order. */
191 size_t used; /* Number of bytes used for input data. */
192 size_t nlines; /* Number of lines in the line array. */
193 idx_t alloc; /* Number of bytes allocated. */
194 size_t left; /* Number of bytes left from previous reads. */
195 size_t line_bytes; /* Number of bytes to reserve for each line. */
196 bool eof; /* An EOF has been read. */
199 /* Sort key. */
200 struct keyfield
202 size_t sword; /* Zero-origin 'word' to start at. */
203 size_t schar; /* Additional characters to skip. */
204 size_t eword; /* Zero-origin last 'word' of key. */
205 size_t echar; /* Additional characters in field. */
206 bool const *ignore; /* Boolean array of characters to ignore. */
207 char const *translate; /* Translation applied to characters. */
208 bool skipsblanks; /* Skip leading blanks when finding start. */
209 bool skipeblanks; /* Skip leading blanks when finding end. */
210 bool numeric; /* Flag for numeric comparison. Handle
211 strings of digits with optional decimal
212 point, but no exponential notation. */
213 bool random; /* Sort by random hash of key. */
214 bool general_numeric; /* Flag for general, numeric comparison.
215 Handle numbers in exponential notation. */
216 bool human_numeric; /* Flag for sorting by human readable
217 units with either SI or IEC prefixes. */
218 bool month; /* Flag for comparison by month name. */
219 bool reverse; /* Reverse the sense of comparison. */
220 bool version; /* sort by version number */
221 bool traditional_used; /* Traditional key option format is used. */
222 struct keyfield *next; /* Next keyfield to try. */
225 struct month
227 char const *name;
228 int val;
231 /* Binary merge tree node. */
232 struct merge_node
234 struct line *lo; /* Lines to merge from LO child node. */
235 struct line *hi; /* Lines to merge from HI child node. */
236 struct line *end_lo; /* End of available lines from LO. */
237 struct line *end_hi; /* End of available lines from HI. */
238 struct line **dest; /* Pointer to destination of merge. */
239 size_t nlo; /* Total Lines remaining from LO. */
240 size_t nhi; /* Total lines remaining from HI. */
241 struct merge_node *parent; /* Parent node. */
242 struct merge_node *lo_child; /* LO child node. */
243 struct merge_node *hi_child; /* HI child node. */
244 unsigned int level; /* Level in merge tree. */
245 bool queued; /* Node is already in heap. */
246 pthread_mutex_t lock; /* Lock for node operations. */
249 /* Priority queue of merge nodes. */
250 struct merge_node_queue
252 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
253 pthread_mutex_t mutex; /* Lock for queue operations. */
254 pthread_cond_t cond; /* Conditional wait for empty queue to populate
255 when popping. */
258 /* Used to implement --unique (-u). */
259 static struct line saved_line;
261 /* FIXME: None of these tables work with multibyte character sets.
262 Also, there are many other bugs when handling multibyte characters.
263 One way to fix this is to rewrite 'sort' to use wide characters
264 internally, but doing this with good performance is a bit
265 tricky. */
267 /* Table of blanks. */
268 static bool blanks[UCHAR_LIM];
270 /* Table of non-printing characters. */
271 static bool nonprinting[UCHAR_LIM];
273 /* Table of non-dictionary characters (not letters, digits, or blanks). */
274 static bool nondictionary[UCHAR_LIM];
276 /* Translation table folding lower case to upper. */
277 static char fold_toupper[UCHAR_LIM];
279 #define MONTHS_PER_YEAR 12
281 /* Table mapping month names to integers.
282 Alphabetic order allows binary search. */
283 static struct month monthtab[] =
285 {"APR", 4},
286 {"AUG", 8},
287 {"DEC", 12},
288 {"FEB", 2},
289 {"JAN", 1},
290 {"JUL", 7},
291 {"JUN", 6},
292 {"MAR", 3},
293 {"MAY", 5},
294 {"NOV", 11},
295 {"OCT", 10},
296 {"SEP", 9}
299 /* During the merge phase, the number of files to merge at once. */
300 #define NMERGE_DEFAULT 16
302 /* Minimum size for a merge or check buffer. */
303 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
305 /* Minimum sort size; the code might not work with smaller sizes. */
306 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
308 /* The number of bytes needed for a merge or check buffer, which can
309 function relatively efficiently even if it holds only one line. If
310 a longer line is seen, this value is increased. */
311 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
313 /* The approximate maximum number of bytes of main memory to use, as
314 specified by the user. Zero if the user has not specified a size. */
315 static size_t sort_size;
317 /* The initial allocation factor for non-regular files.
318 This is used, e.g., when reading from a pipe.
319 Don't make it too big, since it is multiplied by ~130 to
320 obtain the size of the actual buffer sort will allocate.
321 Also, there may be 8 threads all doing this at the same time. */
322 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
324 /* Array of directory names in which any temporary files are to be created. */
325 static char const **temp_dirs;
327 /* Number of temporary directory names used. */
328 static idx_t temp_dir_count;
330 /* Number of allocated slots in temp_dirs. */
331 static idx_t temp_dir_alloc;
333 /* Flag to reverse the order of all comparisons. */
334 static bool reverse;
336 /* Flag for stable sort. This turns off the last ditch bytewise
337 comparison of lines, and instead leaves lines in the same order
338 they were read if all keys compare equal. */
339 static bool stable;
341 /* An int value outside char range. */
342 enum { NON_CHAR = CHAR_MAX + 1 };
344 /* If TAB has this value, blanks separate fields. */
345 enum { TAB_DEFAULT = CHAR_MAX + 1 };
347 /* Tab character separating fields. If TAB_DEFAULT, then fields are
348 separated by the empty string between a non-blank character and a blank
349 character. */
350 static int tab = TAB_DEFAULT;
352 /* Flag to remove consecutive duplicate lines from the output.
353 Only the last of a sequence of equal lines will be output. */
354 static bool unique;
356 /* Nonzero if any of the input files are the standard input. */
357 static bool have_read_stdin;
359 /* List of key field comparisons to be tried. */
360 static struct keyfield *keylist;
362 /* Program used to (de)compress temp files. Must accept -d. */
363 static char const *compress_program;
365 /* Annotate the output with extra info to aid the user. */
366 static bool debug;
368 /* Maximum number of files to merge in one go. If more than this
369 number are present, temp files will be used. */
370 static unsigned int nmerge = NMERGE_DEFAULT;
372 /* Output an error to stderr and exit using async-signal-safe routines.
373 This can be used safely from signal handlers,
374 and between fork and exec of multithreaded processes. */
376 static _Noreturn void
377 async_safe_die (int errnum, char const *errstr)
379 ignore_value (write (STDERR_FILENO, errstr, strlen (errstr)));
381 /* Even if defined HAVE_STRERROR_R, we can't use it,
382 as it may return a translated string etc. and even if not
383 may call malloc which is unsafe. We might improve this
384 by testing for sys_errlist and using that if available.
385 For now just report the error number. */
386 if (errnum)
388 char errbuf[INT_BUFSIZE_BOUND (errnum)];
389 char *p = inttostr (errnum, errbuf);
390 ignore_value (write (STDERR_FILENO, ": errno ", 8));
391 ignore_value (write (STDERR_FILENO, p, strlen (p)));
394 ignore_value (write (STDERR_FILENO, "\n", 1));
396 _exit (SORT_FAILURE);
399 /* Report MESSAGE for FILE, then clean up and exit.
400 If FILE is null, it represents standard output. */
402 static void
403 sort_die (char const *message, char const *file)
405 error (SORT_FAILURE, errno, "%s: %s", message,
406 quotef (file ? file : _("standard output")));
409 void
410 usage (int status)
412 if (status != EXIT_SUCCESS)
413 emit_try_help ();
414 else
416 printf (_("\
417 Usage: %s [OPTION]... [FILE]...\n\
418 or: %s [OPTION]... --files0-from=F\n\
420 program_name, program_name);
421 fputs (_("\
422 Write sorted concatenation of all FILE(s) to standard output.\n\
423 "), stdout);
425 emit_stdin_note ();
426 emit_mandatory_arg_note ();
428 fputs (_("\
429 Ordering options:\n\
431 "), stdout);
432 fputs (_("\
433 -b, --ignore-leading-blanks ignore leading blanks\n\
434 -d, --dictionary-order consider only blanks and alphanumeric characters\
436 -f, --ignore-case fold lower case to upper case characters\n\
437 "), stdout);
438 fputs (_("\
439 -g, --general-numeric-sort compare according to general numerical value\n\
440 -i, --ignore-nonprinting consider only printable characters\n\
441 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
442 "), stdout);
443 fputs (_("\
444 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
445 "), stdout);
446 fputs (_("\
447 -n, --numeric-sort compare according to string numerical value;\n\
448 see full documentation for supported strings\n\
449 "), stdout);
450 fputs (_("\
451 -R, --random-sort shuffle, but group identical keys. See shuf(1)\n\
452 --random-source=FILE get random bytes from FILE\n\
453 -r, --reverse reverse the result of comparisons\n\
454 "), stdout);
455 fputs (_("\
456 --sort=WORD sort according to WORD:\n\
457 general-numeric -g, human-numeric -h, month -M,\
459 numeric -n, random -R, version -V\n\
460 -V, --version-sort natural sort of (version) numbers within text\n\
462 "), stdout);
463 fputs (_("\
464 Other options:\n\
466 "), stdout);
467 fputs (_("\
468 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
469 for more use temp files\n\
470 "), stdout);
471 fputs (_("\
472 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
473 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
475 --compress-program=PROG compress temporaries with PROG;\n\
476 decompress them with PROG -d\n\
477 "), stdout);
478 fputs (_("\
479 --debug annotate the part of the line used to sort,\n\
480 and warn about questionable usage to stderr\n\
481 --files0-from=F read input from the files specified by\n\
482 NUL-terminated names in file F;\n\
483 If F is - then read names from standard input\n\
484 "), stdout);
485 fputs (_("\
486 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
487 -m, --merge merge already sorted files; do not sort\n\
488 "), stdout);
489 fputs (_("\
490 -o, --output=FILE write result to FILE instead of standard output\n\
491 -s, --stable stabilize sort by disabling last-resort comparison\
493 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
494 "), stdout);
495 printf (_("\
496 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
497 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
498 multiple options specify multiple directories\n\
499 --parallel=N change the number of sorts run concurrently to N\n\
500 -u, --unique with -c, check for strict ordering;\n\
501 without -c, output only the first of an equal run\
503 "), DEFAULT_TMPDIR);
504 fputs (_("\
505 -z, --zero-terminated line delimiter is NUL, not newline\n\
506 "), stdout);
507 fputs (HELP_OPTION_DESCRIPTION, stdout);
508 fputs (VERSION_OPTION_DESCRIPTION, stdout);
509 fputs (_("\
511 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
512 field number and C a character position in the field; both are origin 1, and\n\
513 the stop position defaults to the line's end. If neither -t nor -b is in\n\
514 effect, characters in a field are counted from the beginning of the preceding\n\
515 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
517 which override global ordering options for that key. If no key is given, use\n\
518 the entire line as the key. Use --debug to diagnose incorrect key usage.\n\
520 SIZE may be followed by the following multiplicative suffixes:\n\
521 "), stdout);
522 fputs (_("\
523 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y, R, Q.\
524 \n\n\
525 *** WARNING ***\n\
526 The locale specified by the environment affects sort order.\n\
527 Set LC_ALL=C to get the traditional sort order that uses\n\
528 native byte values.\n\
529 "), stdout );
530 emit_ancillary_info (PROGRAM_NAME);
533 exit (status);
536 /* For long options that have no equivalent short option, use a
537 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
538 enum
540 CHECK_OPTION = CHAR_MAX + 1,
541 COMPRESS_PROGRAM_OPTION,
542 DEBUG_PROGRAM_OPTION,
543 FILES0_FROM_OPTION,
544 NMERGE_OPTION,
545 RANDOM_SOURCE_OPTION,
546 SORT_OPTION,
547 PARALLEL_OPTION
550 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
552 static struct option const long_options[] =
554 {"ignore-leading-blanks", no_argument, nullptr, 'b'},
555 {"check", optional_argument, nullptr, CHECK_OPTION},
556 {"compress-program", required_argument, nullptr, COMPRESS_PROGRAM_OPTION},
557 {"debug", no_argument, nullptr, DEBUG_PROGRAM_OPTION},
558 {"dictionary-order", no_argument, nullptr, 'd'},
559 {"ignore-case", no_argument, nullptr, 'f'},
560 {"files0-from", required_argument, nullptr, FILES0_FROM_OPTION},
561 {"general-numeric-sort", no_argument, nullptr, 'g'},
562 {"ignore-nonprinting", no_argument, nullptr, 'i'},
563 {"key", required_argument, nullptr, 'k'},
564 {"merge", no_argument, nullptr, 'm'},
565 {"month-sort", no_argument, nullptr, 'M'},
566 {"numeric-sort", no_argument, nullptr, 'n'},
567 {"human-numeric-sort", no_argument, nullptr, 'h'},
568 {"version-sort", no_argument, nullptr, 'V'},
569 {"random-sort", no_argument, nullptr, 'R'},
570 {"random-source", required_argument, nullptr, RANDOM_SOURCE_OPTION},
571 {"sort", required_argument, nullptr, SORT_OPTION},
572 {"output", required_argument, nullptr, 'o'},
573 {"reverse", no_argument, nullptr, 'r'},
574 {"stable", no_argument, nullptr, 's'},
575 {"batch-size", required_argument, nullptr, NMERGE_OPTION},
576 {"buffer-size", required_argument, nullptr, 'S'},
577 {"field-separator", required_argument, nullptr, 't'},
578 {"temporary-directory", required_argument, nullptr, 'T'},
579 {"unique", no_argument, nullptr, 'u'},
580 {"zero-terminated", no_argument, nullptr, 'z'},
581 {"parallel", required_argument, nullptr, PARALLEL_OPTION},
582 {GETOPT_HELP_OPTION_DECL},
583 {GETOPT_VERSION_OPTION_DECL},
584 {nullptr, 0, nullptr, 0},
587 #define CHECK_TABLE \
588 _ct_("quiet", 'C') \
589 _ct_("silent", 'C') \
590 _ct_("diagnose-first", 'c')
592 static char const *const check_args[] =
594 #define _ct_(_s, _c) _s,
595 CHECK_TABLE nullptr
596 #undef _ct_
598 static char const check_types[] =
600 #define _ct_(_s, _c) _c,
601 CHECK_TABLE
602 #undef _ct_
605 #define SORT_TABLE \
606 _st_("general-numeric", 'g') \
607 _st_("human-numeric", 'h') \
608 _st_("month", 'M') \
609 _st_("numeric", 'n') \
610 _st_("random", 'R') \
611 _st_("version", 'V')
613 static char const *const sort_args[] =
615 #define _st_(_s, _c) _s,
616 SORT_TABLE nullptr
617 #undef _st_
619 static char const sort_types[] =
621 #define _st_(_s, _c) _c,
622 SORT_TABLE
623 #undef _st_
626 /* The set of signals that are caught. */
627 static sigset_t caught_signals;
629 /* Critical section status. */
630 struct cs_status
632 bool valid;
633 sigset_t sigs;
636 /* Enter a critical section. */
637 static void
638 cs_enter (struct cs_status *status)
640 int ret = pthread_sigmask (SIG_BLOCK, &caught_signals, &status->sigs);
641 status->valid = ret == 0;
644 /* Leave a critical section. */
645 static void
646 cs_leave (struct cs_status const *status)
648 if (status->valid)
650 /* Ignore failure when restoring the signal mask. */
651 pthread_sigmask (SIG_SETMASK, &status->sigs, nullptr);
655 /* Possible states for a temp file. If compressed, the file's status
656 is unreaped or reaped, depending on whether 'sort' has waited for
657 the subprocess to finish. */
658 enum { UNCOMPRESSED, UNREAPED, REAPED };
660 /* The list of temporary files. */
661 struct tempnode
663 struct tempnode *volatile next;
664 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
665 char state;
666 char name[FLEXIBLE_ARRAY_MEMBER];
668 static struct tempnode *volatile temphead;
669 static struct tempnode *volatile *temptail = &temphead;
671 /* A file to be sorted. */
672 struct sortfile
674 /* The file's name. */
675 char const *name;
677 /* Non-null if this is a temporary file, in which case NAME == TEMP->name. */
678 struct tempnode *temp;
681 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
682 static Hash_table *proctab;
684 enum { INIT_PROCTAB_SIZE = 47 };
686 static size_t
687 proctab_hasher (void const *entry, size_t tabsize)
689 struct tempnode const *node = entry;
690 return node->pid % tabsize;
693 static bool
694 proctab_comparator (void const *e1, void const *e2)
696 struct tempnode const *n1 = e1;
697 struct tempnode const *n2 = e2;
698 return n1->pid == n2->pid;
701 /* The number of unreaped child processes. */
702 static pid_t nprocs;
704 static bool delete_proc (pid_t);
706 /* If PID is positive, wait for the child process with that PID to
707 exit, and assume that PID has already been removed from the process
708 table. If PID is 0 or -1, clean up some child that has exited (by
709 waiting for it, and removing it from the proc table) and return the
710 child's process ID. However, if PID is 0 and no children have
711 exited, return 0 without waiting. */
713 static pid_t
714 reap (pid_t pid)
716 int status;
717 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
719 if (cpid < 0)
720 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
721 quoteaf (compress_program));
722 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
724 if (! WIFEXITED (status) || WEXITSTATUS (status))
725 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
726 quoteaf (compress_program));
727 --nprocs;
730 return cpid;
733 /* TEMP represents a new process; add it to the process table. Create
734 the process table the first time it's called. */
736 static void
737 register_proc (struct tempnode *temp)
739 if (! proctab)
741 proctab = hash_initialize (INIT_PROCTAB_SIZE, nullptr,
742 proctab_hasher,
743 proctab_comparator,
744 nullptr);
745 if (! proctab)
746 xalloc_die ();
749 temp->state = UNREAPED;
751 if (! hash_insert (proctab, temp))
752 xalloc_die ();
755 /* If PID is in the process table, remove it and return true.
756 Otherwise, return false. */
758 static bool
759 delete_proc (pid_t pid)
761 struct tempnode test;
763 test.pid = pid;
764 struct tempnode *node = hash_remove (proctab, &test);
765 if (! node)
766 return false;
767 node->state = REAPED;
768 return true;
771 /* Remove PID from the process table, and wait for it to exit if it
772 hasn't already. */
774 static void
775 wait_proc (pid_t pid)
777 if (delete_proc (pid))
778 reap (pid);
781 /* Reap any exited children. Do not block; reap only those that have
782 already exited. */
784 static void
785 reap_exited (void)
787 while (0 < nprocs && reap (0))
788 continue;
791 /* Reap at least one exited child, waiting if necessary. */
793 static void
794 reap_some (void)
796 reap (-1);
797 reap_exited ();
800 /* Reap all children, waiting if necessary. */
802 static void
803 reap_all (void)
805 while (0 < nprocs)
806 reap (-1);
809 /* Clean up any remaining temporary files. */
811 static void
812 cleanup (void)
814 struct tempnode const *node;
816 for (node = temphead; node; node = node->next)
817 unlink (node->name);
818 temphead = nullptr;
821 /* Cleanup actions to take when exiting. */
823 static void
824 exit_cleanup (void)
826 if (temphead)
828 /* Clean up any remaining temporary files in a critical section so
829 that a signal handler does not try to clean them too. */
830 struct cs_status cs;
831 cs_enter (&cs);
832 cleanup ();
833 cs_leave (&cs);
836 close_stdout ();
839 /* Create a new temporary file, returning its newly allocated tempnode.
840 Store into *PFD the file descriptor open for writing.
841 If the creation fails, return nullptr and store -1 into *PFD if the
842 failure is due to file descriptor exhaustion and
843 SURVIVE_FD_EXHAUSTION; otherwise, die. */
845 static struct tempnode *
846 create_temp_file (int *pfd, bool survive_fd_exhaustion)
848 static char const slashbase[] = "/sortXXXXXX";
849 static idx_t temp_dir_index;
850 int fd;
851 int saved_errno;
852 char const *temp_dir = temp_dirs[temp_dir_index];
853 size_t len = strlen (temp_dir);
854 struct tempnode *node =
855 xmalloc (FLEXSIZEOF (struct tempnode, name, len + sizeof slashbase));
856 char *file = node->name;
857 struct cs_status cs;
859 memcpy (file, temp_dir, len);
860 memcpy (file + len, slashbase, sizeof slashbase);
861 node->next = nullptr;
862 if (++temp_dir_index == temp_dir_count)
863 temp_dir_index = 0;
865 /* Create the temporary file in a critical section, to avoid races. */
866 cs_enter (&cs);
867 fd = mkostemp (file, O_CLOEXEC);
868 if (0 <= fd)
870 *temptail = node;
871 temptail = &node->next;
873 saved_errno = errno;
874 cs_leave (&cs);
875 errno = saved_errno;
877 if (fd < 0)
879 if (! (survive_fd_exhaustion && errno == EMFILE))
880 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
881 quoteaf (temp_dir));
882 free (node);
883 node = nullptr;
886 *pfd = fd;
887 return node;
890 /* Return a pointer to stdout status, or nullptr on failure. */
892 static struct stat *
893 get_outstatus (void)
895 static int outstat_errno;
896 static struct stat outstat;
897 if (outstat_errno == 0)
898 outstat_errno = fstat (STDOUT_FILENO, &outstat) == 0 ? -1 : errno;
899 return outstat_errno < 0 ? &outstat : nullptr;
902 /* Return a stream for FILE, opened with mode HOW. If HOW is "w",
903 the file is already open on standard output, and needs to be
904 truncated unless FILE is null. When opening for input, "-"
905 means standard input. To avoid confusion, do not return file
906 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
907 opening an ordinary FILE. Return nullptr if unsuccessful.
909 Use fadvise to specify an access pattern for input files.
910 There are a few hints we could possibly provide,
911 and after careful testing it was decided that
912 specifying FADVISE_SEQUENTIAL was not detrimental
913 to any cases. On Linux 2.6.31, this option doubles
914 the size of read ahead performed and thus was seen to
915 benefit these cases:
916 Merging
917 Sorting with a smaller internal buffer
918 Reading from faster flash devices
920 In _addition_ one could also specify other hints...
922 FADVISE_WILLNEED was tested, but Linux 2.6.31
923 at least uses that to _synchronously_ prepopulate the cache
924 with the specified range. While sort does need to
925 read all of its input before outputting, a synchronous
926 read of the whole file up front precludes any processing
927 that sort could do in parallel with the system doing
928 read ahead of the data. This was seen to have negative effects
929 in a couple of cases:
930 Merging
931 Sorting with a smaller internal buffer
932 This option was seen to shorten the runtime for sort
933 on a multicore system with lots of RAM and other processes
934 competing for CPU. It could be argued that more explicit
935 scheduling hints with 'nice' et. al. are more appropriate
936 for this situation.
938 FADVISE_NOREUSE is a possibility as it could lower
939 the priority of input data in the cache as sort will
940 only need to process it once. However its functionality
941 has changed over Linux kernel versions and as of 2.6.31
942 it does nothing and thus we can't depend on what it might
943 do in future.
945 FADVISE_DONTNEED is not appropriate for user specified
946 input files, but for temp files we do want to drop the
947 cache immediately after processing. This is done implicitly
948 however when the files are unlinked. */
950 static FILE *
951 stream_open (char const *file, char const *how)
953 FILE *fp;
955 if (*how == 'r')
957 if (STREQ (file, "-"))
959 have_read_stdin = true;
960 fp = stdin;
962 else
964 int fd = open (file, O_RDONLY | O_CLOEXEC);
965 fp = fd < 0 ? nullptr : fdopen (fd, how);
967 fadvise (fp, FADVISE_SEQUENTIAL);
969 else if (*how == 'w')
971 if (file && ftruncate (STDOUT_FILENO, 0) != 0)
973 int ftruncate_errno = errno;
974 struct stat *outst = get_outstatus ();
975 if (!outst || S_ISREG (outst->st_mode) || S_TYPEISSHM (outst))
976 error (SORT_FAILURE, ftruncate_errno, _("%s: error truncating"),
977 quotef (file));
979 fp = stdout;
981 else
982 affirm (!"unexpected mode passed to stream_open");
984 return fp;
987 /* Same as stream_open, except always return a non-null value; die on
988 failure. */
990 static FILE *
991 xfopen (char const *file, char const *how)
993 FILE *fp = stream_open (file, how);
994 if (!fp)
995 sort_die (_("open failed"), file);
996 return fp;
999 /* Close FP, whose name is FILE, and report any errors. */
1001 static void
1002 xfclose (FILE *fp, char const *file)
1004 switch (fileno (fp))
1006 case STDIN_FILENO:
1007 /* Allow reading stdin from tty more than once. */
1008 clearerr (fp);
1009 break;
1011 case STDOUT_FILENO:
1012 /* Don't close stdout just yet. close_stdout does that. */
1013 if (fflush (fp) != 0)
1014 sort_die (_("fflush failed"), file);
1015 break;
1017 default:
1018 if (fclose (fp) != 0)
1019 sort_die (_("close failed"), file);
1020 break;
1024 /* Move OLDFD to NEWFD. If OLDFD != NEWFD, NEWFD is not close-on-exec. */
1026 static void
1027 move_fd (int oldfd, int newfd)
1029 if (oldfd != newfd)
1031 /* These should never fail for our usage. */
1032 ignore_value (dup2 (oldfd, newfd));
1033 ignore_value (close (oldfd));
1037 /* Fork a child process for piping to and do common cleanup. The
1038 TRIES parameter specifies how many times to try to fork before
1039 giving up. Return the PID of the child, or -1 (setting errno)
1040 on failure. */
1042 static pid_t
1043 pipe_fork (int pipefds[2], size_t tries)
1045 #if HAVE_WORKING_FORK
1046 struct tempnode *saved_temphead;
1047 int saved_errno;
1048 double wait_retry = 0.25;
1049 pid_t pid;
1050 struct cs_status cs;
1052 if (pipe2 (pipefds, O_CLOEXEC) < 0)
1053 return -1;
1055 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1056 uncontrolled subprocess generation can hurt performance significantly.
1057 Allow at most NMERGE + 2 subprocesses, on the theory that there
1058 may be some useful parallelism by letting compression for the
1059 previous merge finish (1 subprocess) in parallel with the current
1060 merge (NMERGE + 1 subprocesses). */
1062 if (nmerge + 1 < nprocs)
1063 reap_some ();
1065 while (tries--)
1067 /* This is so the child process won't delete our temp files
1068 if it receives a signal before exec-ing. */
1069 cs_enter (&cs);
1070 saved_temphead = temphead;
1071 temphead = nullptr;
1073 pid = fork ();
1074 saved_errno = errno;
1075 if (pid)
1076 temphead = saved_temphead;
1078 cs_leave (&cs);
1079 errno = saved_errno;
1081 if (0 <= pid || errno != EAGAIN)
1082 break;
1083 else
1085 xnanosleep (wait_retry);
1086 wait_retry *= 2;
1087 reap_exited ();
1091 if (pid < 0)
1093 saved_errno = errno;
1094 close (pipefds[0]);
1095 close (pipefds[1]);
1096 errno = saved_errno;
1098 else if (pid == 0)
1100 close (STDIN_FILENO);
1101 close (STDOUT_FILENO);
1103 else
1104 ++nprocs;
1106 return pid;
1108 #else /* ! HAVE_WORKING_FORK */
1109 return -1;
1110 #endif
1113 /* Create a temporary file and, if asked for, start a compressor
1114 to that file. Set *PFP to the file handle and return
1115 the address of the new temp node. If the creation
1116 fails, return nullptr if the failure is due to file descriptor
1117 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1119 static struct tempnode *
1120 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1122 int tempfd;
1123 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1124 if (! node)
1125 return nullptr;
1127 node->state = UNCOMPRESSED;
1129 if (compress_program)
1131 int pipefds[2];
1133 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1134 if (0 < node->pid)
1136 close (tempfd);
1137 close (pipefds[0]);
1138 tempfd = pipefds[1];
1140 register_proc (node);
1142 else if (node->pid == 0)
1144 /* Being the child of a multithreaded program before exec,
1145 we're restricted to calling async-signal-safe routines here. */
1146 close (pipefds[1]);
1147 move_fd (tempfd, STDOUT_FILENO);
1148 move_fd (pipefds[0], STDIN_FILENO);
1150 execlp (compress_program, compress_program, (char *) nullptr);
1152 async_safe_die (errno, "couldn't execute compress program");
1156 *pfp = fdopen (tempfd, "w");
1157 if (! *pfp)
1158 sort_die (_("couldn't create temporary file"), node->name);
1160 return node;
1163 /* Create a temporary file and, if asked for, start a compressor
1164 to that file. Set *PFP to the file handle and return the address
1165 of the new temp node. Die on failure. */
1167 static struct tempnode *
1168 create_temp (FILE **pfp)
1170 return maybe_create_temp (pfp, false);
1173 /* Open a compressed temp file and start a decompression process through
1174 which to filter the input. Return nullptr (setting errno to
1175 EMFILE) if we ran out of file descriptors, and die on any other
1176 kind of failure. */
1178 static FILE *
1179 open_temp (struct tempnode *temp)
1181 int tempfd, pipefds[2];
1182 FILE *fp = nullptr;
1184 if (temp->state == UNREAPED)
1185 wait_proc (temp->pid);
1187 tempfd = open (temp->name, O_RDONLY);
1188 if (tempfd < 0)
1189 return nullptr;
1191 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1193 switch (child)
1195 case -1:
1196 if (errno != EMFILE)
1197 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1198 quoteaf (compress_program));
1199 close (tempfd);
1200 errno = EMFILE;
1201 break;
1203 case 0:
1204 /* Being the child of a multithreaded program before exec,
1205 we're restricted to calling async-signal-safe routines here. */
1206 close (pipefds[0]);
1207 move_fd (tempfd, STDIN_FILENO);
1208 move_fd (pipefds[1], STDOUT_FILENO);
1210 execlp (compress_program, compress_program, "-d", (char *) nullptr);
1212 async_safe_die (errno, "couldn't execute compress program (with -d)");
1214 default:
1215 temp->pid = child;
1216 register_proc (temp);
1217 close (tempfd);
1218 close (pipefds[1]);
1220 fp = fdopen (pipefds[0], "r");
1221 if (! fp)
1223 int saved_errno = errno;
1224 close (pipefds[0]);
1225 errno = saved_errno;
1227 break;
1230 return fp;
1233 /* Append DIR to the array of temporary directory names. */
1234 static void
1235 add_temp_dir (char const *dir)
1237 if (temp_dir_count == temp_dir_alloc)
1238 temp_dirs = xpalloc (temp_dirs, &temp_dir_alloc, 1, -1, sizeof *temp_dirs);
1240 temp_dirs[temp_dir_count++] = dir;
1243 /* Remove NAME from the list of temporary files. */
1245 static void
1246 zaptemp (char const *name)
1248 struct tempnode *volatile *pnode;
1249 struct tempnode *node;
1250 struct tempnode *next;
1251 int unlink_status;
1252 int unlink_errno = 0;
1253 struct cs_status cs;
1255 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1256 continue;
1258 if (node->state == UNREAPED)
1259 wait_proc (node->pid);
1261 /* Unlink the temporary file in a critical section to avoid races. */
1262 next = node->next;
1263 cs_enter (&cs);
1264 unlink_status = unlink (name);
1265 unlink_errno = errno;
1266 *pnode = next;
1267 cs_leave (&cs);
1269 if (unlink_status != 0)
1270 error (0, unlink_errno, _("warning: cannot remove: %s"), quotef (name));
1271 if (! next)
1272 temptail = pnode;
1273 free (node);
1276 #if HAVE_NL_LANGINFO
1278 static int
1279 struct_month_cmp (void const *m1, void const *m2)
1281 struct month const *month1 = m1;
1282 struct month const *month2 = m2;
1283 return strcmp (month1->name, month2->name);
1286 #endif
1288 /* Initialize the character class tables. */
1290 static void
1291 inittables (void)
1293 size_t i;
1295 for (i = 0; i < UCHAR_LIM; ++i)
1297 blanks[i] = i == '\n' || isblank (i);
1298 nondictionary[i] = ! blanks[i] && ! isalnum (i);
1299 nonprinting[i] = ! isprint (i);
1300 fold_toupper[i] = toupper (i);
1303 #if HAVE_NL_LANGINFO
1304 /* If we're not in the "C" locale, read different names for months. */
1305 if (hard_LC_TIME)
1307 for (i = 0; i < MONTHS_PER_YEAR; i++)
1309 char const *s;
1310 size_t s_len;
1311 size_t j, k;
1312 char *name;
1314 s = nl_langinfo (ABMON_1 + i);
1315 s_len = strlen (s);
1316 monthtab[i].name = name = xmalloc (s_len + 1);
1317 monthtab[i].val = i + 1;
1319 for (j = k = 0; j < s_len; j++)
1320 if (! isblank (to_uchar (s[j])))
1321 name[k++] = fold_toupper[to_uchar (s[j])];
1322 name[k] = '\0';
1324 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1326 #endif
1329 /* Specify how many inputs may be merged at once.
1330 This may be set on the command-line with the
1331 --batch-size option. */
1332 static void
1333 specify_nmerge (int oi, char c, char const *s)
1335 uintmax_t n;
1336 struct rlimit rlimit;
1337 enum strtol_error e = xstrtoumax (s, nullptr, 10, &n, "");
1339 /* Try to find out how many file descriptors we'll be able
1340 to open. We need at least nmerge + 3 (STDIN_FILENO,
1341 STDOUT_FILENO and STDERR_FILENO). */
1342 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1343 ? rlimit.rlim_cur
1344 : OPEN_MAX)
1345 - 3);
1347 if (e == LONGINT_OK)
1349 nmerge = n;
1350 if (nmerge != n)
1351 e = LONGINT_OVERFLOW;
1352 else
1354 if (nmerge < 2)
1356 error (0, 0, _("invalid --%s argument %s"),
1357 long_options[oi].name, quote (s));
1358 error (SORT_FAILURE, 0,
1359 _("minimum --%s argument is %s"),
1360 long_options[oi].name, quote ("2"));
1362 else if (max_nmerge < nmerge)
1364 e = LONGINT_OVERFLOW;
1366 else
1367 return;
1371 if (e == LONGINT_OVERFLOW)
1373 error (0, 0, _("--%s argument %s too large"),
1374 long_options[oi].name, quote (s));
1375 error (SORT_FAILURE, 0,
1376 _("maximum --%s argument with current rlimit is %u"),
1377 long_options[oi].name, max_nmerge);
1379 else
1380 xstrtol_fatal (e, oi, c, long_options, s);
1383 /* Specify the amount of main memory to use when sorting. */
1384 static void
1385 specify_sort_size (int oi, char c, char const *s)
1387 uintmax_t n;
1388 char *suffix;
1389 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPQRtTYZ");
1391 /* The default unit is KiB. */
1392 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1394 if (n <= UINTMAX_MAX / 1024)
1395 n *= 1024;
1396 else
1397 e = LONGINT_OVERFLOW;
1400 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1401 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1402 switch (suffix[0])
1404 case 'b':
1405 e = LONGINT_OK;
1406 break;
1408 case '%':
1410 double mem = physmem_total () * n / 100;
1412 /* Use "<", not "<=", to avoid problems with rounding. */
1413 if (mem < UINTMAX_MAX)
1415 n = mem;
1416 e = LONGINT_OK;
1418 else
1419 e = LONGINT_OVERFLOW;
1421 break;
1424 if (e == LONGINT_OK)
1426 /* If multiple sort sizes are specified, take the maximum, so
1427 that option order does not matter. */
1428 if (n < sort_size)
1429 return;
1431 sort_size = n;
1432 if (sort_size == n)
1434 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1435 return;
1438 e = LONGINT_OVERFLOW;
1441 xstrtol_fatal (e, oi, c, long_options, s);
1444 /* Specify the number of threads to spawn during internal sort. */
1445 static size_t
1446 specify_nthreads (int oi, char c, char const *s)
1448 uintmax_t nthreads;
1449 enum strtol_error e = xstrtoumax (s, nullptr, 10, &nthreads, "");
1450 if (e == LONGINT_OVERFLOW)
1451 return SIZE_MAX;
1452 if (e != LONGINT_OK)
1453 xstrtol_fatal (e, oi, c, long_options, s);
1454 if (SIZE_MAX < nthreads)
1455 nthreads = SIZE_MAX;
1456 if (nthreads == 0)
1457 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1458 return nthreads;
1461 /* Return the default sort size. */
1462 static size_t
1463 default_sort_size (void)
1465 /* Let SIZE be MEM, but no more than the maximum object size,
1466 total memory, or system resource limits. Don't bother to check
1467 for values like RLIM_INFINITY since in practice they are not much
1468 less than SIZE_MAX. */
1469 size_t size = SIZE_MAX;
1470 struct rlimit rlimit;
1471 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1472 size = rlimit.rlim_cur;
1473 #ifdef RLIMIT_AS
1474 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1475 size = rlimit.rlim_cur;
1476 #endif
1478 /* Leave a large safety margin for the above limits, as failure can
1479 occur when they are exceeded. */
1480 size /= 2;
1482 #ifdef RLIMIT_RSS
1483 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1484 Exceeding RSS is not fatal, but can be quite slow. */
1485 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1486 size = rlimit.rlim_cur / 16 * 15;
1487 #endif
1489 /* Let MEM be available memory or 1/8 of total memory, whichever
1490 is greater. */
1491 double avail = physmem_available ();
1492 double total = physmem_total ();
1493 double mem = MAX (avail, total / 8);
1495 /* Leave a 1/4 margin for physical memory. */
1496 if (total * 0.75 < size)
1497 size = total * 0.75;
1499 /* Return the minimum of MEM and SIZE, but no less than
1500 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1501 right when only one argument is floating point. */
1502 if (mem < size)
1503 size = mem;
1504 return MAX (size, MIN_SORT_SIZE);
1507 /* Return the sort buffer size to use with the input files identified
1508 by FPS and FILES, which are alternate names of the same files.
1509 NFILES gives the number of input files; NFPS may be less. Assume
1510 that each input line requires LINE_BYTES extra bytes' worth of line
1511 information. Do not exceed the size bound specified by the user
1512 (or a default size bound, if the user does not specify one). */
1514 static size_t
1515 sort_buffer_size (FILE *const *fps, size_t nfps,
1516 char *const *files, size_t nfiles,
1517 size_t line_bytes)
1519 /* A bound on the input size. If zero, the bound hasn't been
1520 determined yet. */
1521 static size_t size_bound;
1523 /* In the worst case, each input byte is a newline. */
1524 size_t worst_case_per_input_byte = line_bytes + 1;
1526 /* Keep enough room for one extra input line and an extra byte.
1527 This extra room might be needed when preparing to read EOF. */
1528 size_t size = worst_case_per_input_byte + 1;
1530 for (size_t i = 0; i < nfiles; i++)
1532 struct stat st;
1533 off_t file_size;
1534 size_t worst_case;
1536 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1537 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1538 : stat (files[i], &st))
1539 != 0)
1540 sort_die (_("stat failed"), files[i]);
1542 if (usable_st_size (&st) && 0 < st.st_size)
1543 file_size = st.st_size;
1544 else
1546 /* The file has unknown size. If the user specified a sort
1547 buffer size, use that; otherwise, guess the size. */
1548 if (sort_size)
1549 return sort_size;
1550 file_size = INPUT_FILE_SIZE_GUESS;
1553 if (! size_bound)
1555 size_bound = sort_size;
1556 if (! size_bound)
1557 size_bound = default_sort_size ();
1560 /* Add the amount of memory needed to represent the worst case
1561 where the input consists entirely of newlines followed by a
1562 single non-newline. Check for overflow. */
1563 worst_case = file_size * worst_case_per_input_byte + 1;
1564 if (file_size != worst_case / worst_case_per_input_byte
1565 || size_bound - size <= worst_case)
1566 return size_bound;
1567 size += worst_case;
1570 return size;
1573 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1574 must be at least sizeof (struct line). Allocate ALLOC bytes
1575 initially. */
1577 static void
1578 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1580 /* Ensure that the line array is properly aligned. If the desired
1581 size cannot be allocated, repeatedly halve it until allocation
1582 succeeds. The smaller allocation may hurt overall performance,
1583 but that's better than failing. */
1584 while (true)
1586 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1587 buf->buf = malloc (alloc);
1588 if (buf->buf)
1589 break;
1590 alloc /= 2;
1591 if (alloc <= line_bytes + 1)
1592 xalloc_die ();
1595 buf->line_bytes = line_bytes;
1596 buf->alloc = alloc;
1597 buf->used = buf->left = buf->nlines = 0;
1598 buf->eof = false;
1601 /* Return one past the limit of the line array. */
1603 static inline struct line *
1604 buffer_linelim (struct buffer const *buf)
1606 void *linelim = buf->buf + buf->alloc;
1607 return linelim;
1610 /* Return a pointer to the first character of the field specified
1611 by KEY in LINE. */
1613 static char *
1614 begfield (struct line const *line, struct keyfield const *key)
1616 char *ptr = line->text, *lim = ptr + line->length - 1;
1617 size_t sword = key->sword;
1618 size_t schar = key->schar;
1620 /* The leading field separator itself is included in a field when -t
1621 is absent. */
1623 if (tab != TAB_DEFAULT)
1624 while (ptr < lim && sword--)
1626 while (ptr < lim && *ptr != tab)
1627 ++ptr;
1628 if (ptr < lim)
1629 ++ptr;
1631 else
1632 while (ptr < lim && sword--)
1634 while (ptr < lim && blanks[to_uchar (*ptr)])
1635 ++ptr;
1636 while (ptr < lim && !blanks[to_uchar (*ptr)])
1637 ++ptr;
1640 /* If we're ignoring leading blanks when computing the Start
1641 of the field, skip past them here. */
1642 if (key->skipsblanks)
1643 while (ptr < lim && blanks[to_uchar (*ptr)])
1644 ++ptr;
1646 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1647 ptr = MIN (lim, ptr + schar);
1649 return ptr;
1652 /* Return the limit of (a pointer to the first character after) the field
1653 in LINE specified by KEY. */
1655 ATTRIBUTE_PURE
1656 static char *
1657 limfield (struct line const *line, struct keyfield const *key)
1659 char *ptr = line->text, *lim = ptr + line->length - 1;
1660 size_t eword = key->eword, echar = key->echar;
1662 if (echar == 0)
1663 eword++; /* Skip all of end field. */
1665 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1666 whichever comes first. If there are more than EWORD fields, leave
1667 PTR pointing at the beginning of the field having zero-based index,
1668 EWORD. If a delimiter character was specified (via -t), then that
1669 'beginning' is the first character following the delimiting TAB.
1670 Otherwise, leave PTR pointing at the first 'blank' character after
1671 the preceding field. */
1672 if (tab != TAB_DEFAULT)
1673 while (ptr < lim && eword--)
1675 while (ptr < lim && *ptr != tab)
1676 ++ptr;
1677 if (ptr < lim && (eword || echar))
1678 ++ptr;
1680 else
1681 while (ptr < lim && eword--)
1683 while (ptr < lim && blanks[to_uchar (*ptr)])
1684 ++ptr;
1685 while (ptr < lim && !blanks[to_uchar (*ptr)])
1686 ++ptr;
1689 #ifdef POSIX_UNSPECIFIED
1690 /* The following block of code makes GNU sort incompatible with
1691 standard Unix sort, so it's ifdef'd out for now.
1692 The POSIX spec isn't clear on how to interpret this.
1693 FIXME: request clarification.
1695 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1696 Date: Thu, 30 May 96 12:20:41 -0400
1697 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1699 [...]I believe I've found another bug in 'sort'.
1701 $ cat /tmp/sort.in
1702 a b c 2 d
1703 pq rs 1 t
1704 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1705 a b c 2 d
1706 pq rs 1 t
1707 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1708 pq rs 1 t
1709 a b c 2 d
1711 Unix sort produced the answer I expected: sort on the single character
1712 in column 7. GNU sort produced different results, because it disagrees
1713 on the interpretation of the key-end spec "M.N". Unix sort reads this
1714 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1715 "skip M-1 fields, then either N-1 characters or the rest of the current
1716 field, whichever comes first". This extra clause applies only to
1717 key-ends, not key-starts.
1720 /* Make LIM point to the end of (one byte past) the current field. */
1721 if (tab != TAB_DEFAULT)
1723 char *newlim;
1724 newlim = memchr (ptr, tab, lim - ptr);
1725 if (newlim)
1726 lim = newlim;
1728 else
1730 char *newlim;
1731 newlim = ptr;
1732 while (newlim < lim && blanks[to_uchar (*newlim)])
1733 ++newlim;
1734 while (newlim < lim && !blanks[to_uchar (*newlim)])
1735 ++newlim;
1736 lim = newlim;
1738 #endif
1740 if (echar != 0) /* We need to skip over a portion of the end field. */
1742 /* If we're ignoring leading blanks when computing the End
1743 of the field, skip past them here. */
1744 if (key->skipeblanks)
1745 while (ptr < lim && blanks[to_uchar (*ptr)])
1746 ++ptr;
1748 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1749 ptr = MIN (lim, ptr + echar);
1752 return ptr;
1755 /* Fill BUF reading from FP, moving buf->left bytes from the end
1756 of buf->buf to the beginning first. If EOF is reached and the
1757 file wasn't terminated by a newline, supply one. Set up BUF's line
1758 table too. FILE is the name of the file corresponding to FP.
1759 Return true if some input was read. */
1761 static bool
1762 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1764 struct keyfield const *key = keylist;
1765 char eol = eolchar;
1766 size_t line_bytes = buf->line_bytes;
1767 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1769 if (buf->eof)
1770 return false;
1772 if (buf->used != buf->left)
1774 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1775 buf->used = buf->left;
1776 buf->nlines = 0;
1779 while (true)
1781 char *ptr = buf->buf + buf->used;
1782 struct line *linelim = buffer_linelim (buf);
1783 struct line *line = linelim - buf->nlines;
1784 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1785 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1787 while (line_bytes + 1 < avail)
1789 /* Read as many bytes as possible, but do not read so many
1790 bytes that there might not be enough room for the
1791 corresponding line array. The worst case is when the
1792 rest of the input file consists entirely of newlines,
1793 except that the last byte is not a newline. */
1794 size_t readsize = (avail - 1) / (line_bytes + 1);
1795 size_t bytes_read = fread (ptr, 1, readsize, fp);
1796 char *ptrlim = ptr + bytes_read;
1797 char *p;
1798 avail -= bytes_read;
1800 if (bytes_read != readsize)
1802 if (ferror (fp))
1803 sort_die (_("read failed"), file);
1804 if (feof (fp))
1806 buf->eof = true;
1807 if (buf->buf == ptrlim)
1808 return false;
1809 if (line_start != ptrlim && ptrlim[-1] != eol)
1810 *ptrlim++ = eol;
1814 /* Find and record each line in the just-read input. */
1815 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1817 /* Delimit the line with NUL. This eliminates the need to
1818 temporarily replace the last byte with NUL when calling
1819 xmemcoll, which increases performance. */
1820 *p = '\0';
1821 ptr = p + 1;
1822 line--;
1823 line->text = line_start;
1824 line->length = ptr - line_start;
1825 mergesize = MAX (mergesize, line->length);
1826 avail -= line_bytes;
1828 if (key)
1830 /* Precompute the position of the first key for
1831 efficiency. */
1832 line->keylim = (key->eword == SIZE_MAX
1834 : limfield (line, key));
1836 if (key->sword != SIZE_MAX)
1837 line->keybeg = begfield (line, key);
1838 else
1840 if (key->skipsblanks)
1841 while (blanks[to_uchar (*line_start)])
1842 line_start++;
1843 line->keybeg = line_start;
1847 line_start = ptr;
1850 ptr = ptrlim;
1851 if (buf->eof)
1852 break;
1855 buf->used = ptr - buf->buf;
1856 buf->nlines = buffer_linelim (buf) - line;
1857 if (buf->nlines != 0)
1859 buf->left = ptr - line_start;
1860 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1861 return true;
1865 /* The current input line is too long to fit in the buffer.
1866 Increase the buffer size and try again, keeping it properly
1867 aligned. */
1868 idx_t line_alloc = buf->alloc / sizeof (struct line);
1869 buf->buf = xpalloc (buf->buf, &line_alloc, 1, -1, sizeof (struct line));
1870 buf->alloc = line_alloc * sizeof (struct line);
1875 /* Table that maps characters to order-of-magnitude values. */
1876 static char const unit_order[UCHAR_LIM] =
1878 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1879 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'R' == 82 && 'Q' == 81 \
1880 && 'k' == 107)
1881 /* This initializer syntax works on all C99 hosts. For now, use
1882 it only on non-ASCII hosts, to ease the pain of porting to
1883 pre-C99 ASCII hosts. */
1884 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1885 ['R']=9, ['Q']=10,
1886 ['k']=1,
1887 #else
1888 /* Generate the following table with this command:
1889 perl -e 'my %a=(k=>1,K=>1,M=>2,G=>3,T=>4,P=>5,E=>6,Z=>7,Y=>8,R=>9,Q=>10);
1890 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1891 |fmt */
1892 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1893 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1894 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1895 0, 0, 0, 1, 0, 2, 0, 0, 5, 10, 9, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1896 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1897 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1898 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1899 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1900 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1901 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1902 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1903 #endif
1906 /* Traverse number given as *number consisting of digits, thousands_sep, and
1907 decimal_point chars only. Returns the highest digit found in the number,
1908 or '\0' if no digit has been found. Upon return *number points at the
1909 character that immediately follows after the given number. */
1910 static char
1911 traverse_raw_number (char const **number)
1913 char const *p = *number;
1914 char ch;
1915 char max_digit = '\0';
1916 bool ends_with_thousands_sep = false;
1918 /* Scan to end of number.
1919 Decimals or separators not followed by digits stop the scan.
1920 Numbers ending in decimals or separators are thus considered
1921 to be lacking in units.
1922 FIXME: add support for multibyte thousands_sep and decimal_point. */
1924 while (ISDIGIT (ch = *p++))
1926 if (max_digit < ch)
1927 max_digit = ch;
1929 /* Allow to skip only one occurrence of thousands_sep to avoid finding
1930 the unit in the next column in case thousands_sep matches as blank
1931 and is used as column delimiter. */
1932 ends_with_thousands_sep = (*p == thousands_sep);
1933 if (ends_with_thousands_sep)
1934 ++p;
1937 if (ends_with_thousands_sep)
1939 /* thousands_sep not followed by digit is not allowed. */
1940 *number = p - 2;
1941 return max_digit;
1944 if (ch == decimal_point)
1945 while (ISDIGIT (ch = *p++))
1946 if (max_digit < ch)
1947 max_digit = ch;
1949 *number = p - 1;
1950 return max_digit;
1953 /* Return an integer that represents the order of magnitude of the
1954 unit following the number. The number may contain thousands
1955 separators and a decimal point, but it may not contain leading blanks.
1956 Negative numbers get negative orders; zero numbers have a zero order. */
1958 ATTRIBUTE_PURE
1959 static int
1960 find_unit_order (char const *number)
1962 bool minus_sign = (*number == '-');
1963 char const *p = number + minus_sign;
1964 char max_digit = traverse_raw_number (&p);
1965 if ('0' < max_digit)
1967 unsigned char ch = *p;
1968 int order = unit_order[ch];
1969 return (minus_sign ? -order : order);
1971 else
1972 return 0;
1975 /* Compare numbers A and B ending in units with SI or IEC prefixes
1976 <none/unknown> < K/k < M < G < T < P < E < Z < Y < R < Q */
1978 ATTRIBUTE_PURE
1979 static int
1980 human_numcompare (char const *a, char const *b)
1982 while (blanks[to_uchar (*a)])
1983 a++;
1984 while (blanks[to_uchar (*b)])
1985 b++;
1987 int diff = find_unit_order (a) - find_unit_order (b);
1988 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1991 /* Compare strings A and B as numbers without explicitly converting them to
1992 machine numbers. Comparatively slow for short strings, but asymptotically
1993 hideously fast. */
1995 ATTRIBUTE_PURE
1996 static int
1997 numcompare (char const *a, char const *b)
1999 while (blanks[to_uchar (*a)])
2000 a++;
2001 while (blanks[to_uchar (*b)])
2002 b++;
2004 return strnumcmp (a, b, decimal_point, thousands_sep);
2007 static int
2008 nan_compare (long double a, long double b)
2010 char buf[2][sizeof "-nan""()" + CHAR_BIT * sizeof a];
2011 snprintf (buf[0], sizeof buf[0], "%Lf", a);
2012 snprintf (buf[1], sizeof buf[1], "%Lf", b);
2013 return strcmp (buf[0], buf[1]);
2016 static int
2017 general_numcompare (char const *sa, char const *sb)
2019 /* FIXME: maybe add option to try expensive FP conversion
2020 only if A and B can't be compared more cheaply/accurately. */
2022 char *ea;
2023 char *eb;
2024 long double a = strtold (sa, &ea);
2025 long double b = strtold (sb, &eb);
2027 /* Put conversion errors at the start of the collating sequence. */
2028 if (sa == ea)
2029 return sb == eb ? 0 : -1;
2030 if (sb == eb)
2031 return 1;
2033 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
2034 conversion errors but before numbers; sort them by internal
2035 bit-pattern, for lack of a more portable alternative. */
2036 return (a < b ? -1
2037 : a > b ? 1
2038 : a == b ? 0
2039 : b == b ? -1
2040 : a == a ? 1
2041 : nan_compare (a, b));
2044 /* Return an integer in 1..12 of the month name MONTH.
2045 Return 0 if the name in S is not recognized. */
2047 static int
2048 getmonth (char const *month, char **ea)
2050 size_t lo = 0;
2051 size_t hi = MONTHS_PER_YEAR;
2053 while (blanks[to_uchar (*month)])
2054 month++;
2058 size_t ix = (lo + hi) / 2;
2059 char const *m = month;
2060 char const *n = monthtab[ix].name;
2062 for (;; m++, n++)
2064 if (!*n)
2066 if (ea)
2067 *ea = (char *) m;
2068 return monthtab[ix].val;
2070 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
2072 hi = ix;
2073 break;
2075 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
2077 lo = ix + 1;
2078 break;
2082 while (lo < hi);
2084 return 0;
2087 /* When using the OpenSSL implementation, dynamically link only if -R.
2088 This saves startup time in the usual (sans -R) case. */
2090 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2091 /* In the typical case where md5.h does not #undef HAVE_OPENSSL_MD5,
2092 trick md5.h into declaring and using pointers to functions not functions.
2093 This causes the compiler's -lcrypto option to have no effect,
2094 as sort.o no longer uses any crypto symbols statically. */
2096 # if 14 <= __GNUC__
2097 # pragma GCC diagnostic push
2098 # pragma GCC diagnostic ignored "-Wmissing-variable-declarations"
2099 # endif
2100 # define MD5_Init (*ptr_MD5_Init)
2101 # define MD5_Update (*ptr_MD5_Update)
2102 # define MD5_Final (*ptr_MD5_Final)
2103 #endif
2105 #include "md5.h"
2107 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2108 # if 14 <= __GNUC__
2109 # pragma GCC diagnostic pop
2110 # endif
2111 # include <dlfcn.h>
2113 /* Diagnose a dynamic linking failure. */
2114 static void
2115 link_failure (void)
2117 error (SORT_FAILURE, 0, "%s", dlerror ());
2120 /* Return a function pointer in HANDLE for SYMBOL. */
2121 static void *
2122 symbol_address (void *handle, char const *symbol)
2124 void *address = dlsym (handle, symbol);
2125 if (!address)
2126 link_failure ();
2127 return address;
2129 #endif
2131 /* Dynamically link the crypto library, if it needs linking. */
2132 static void
2133 link_libcrypto (void)
2135 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2136 void *handle = dlopen (LIBCRYPTO_SONAME, RTLD_LAZY | RTLD_GLOBAL);
2137 if (!handle)
2138 link_failure ();
2139 ptr_MD5_Init = symbol_address (handle, "MD5_Init");
2140 ptr_MD5_Update = symbol_address (handle, "MD5_Update");
2141 ptr_MD5_Final = symbol_address (handle, "MD5_Final");
2142 #endif
2145 /* A randomly chosen MD5 state, used for random comparison. */
2146 static struct md5_ctx random_md5_state;
2148 /* Initialize the randomly chosen MD5 state. */
2150 static void
2151 random_md5_state_init (char const *random_source)
2153 unsigned char buf[MD5_DIGEST_SIZE];
2154 struct randread_source *r = randread_new (random_source, sizeof buf);
2155 if (! r)
2156 sort_die (_("open failed"), random_source ? random_source : "getrandom");
2157 randread (r, buf, sizeof buf);
2158 if (randread_free (r) != 0)
2159 sort_die (_("close failed"), random_source);
2160 link_libcrypto ();
2161 md5_init_ctx (&random_md5_state);
2162 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2165 /* This is like strxfrm, except it reports any error and exits. */
2167 static size_t
2168 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2170 errno = 0;
2171 size_t translated_size = strxfrm (dest, src, destsize);
2173 if (errno)
2175 error (0, errno, _("string transformation failed"));
2176 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2177 error (SORT_FAILURE, 0,
2178 _("the original string was %s"),
2179 quotearg_n_style (0, locale_quoting_style, src));
2182 return translated_size;
2185 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2186 using one or more random hash functions. TEXTA[LENA] and
2187 TEXTB[LENB] must be zero. */
2189 static int
2190 compare_random (char *restrict texta, size_t lena,
2191 char *restrict textb, size_t lenb)
2193 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2194 data. This is used to break ties if there is a checksum
2195 collision, and this is good enough given the astronomically low
2196 probability of a collision. */
2197 int xfrm_diff = 0;
2199 char stackbuf[4000];
2200 char *buf = stackbuf;
2201 size_t bufsize = sizeof stackbuf;
2202 void *allocated = nullptr;
2203 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2204 struct md5_ctx s[2];
2205 s[0] = s[1] = random_md5_state;
2207 if (hard_LC_COLLATE)
2209 char const *lima = texta + lena;
2210 char const *limb = textb + lenb;
2212 while (true)
2214 /* Transform the text into the basis of comparison, so that byte
2215 strings that would otherwise considered to be equal are
2216 considered equal here even if their bytes differ.
2218 Each time through this loop, transform one
2219 null-terminated string's worth from TEXTA or from TEXTB
2220 or both. That way, there's no need to store the
2221 transformation of the whole line, if it contains many
2222 null-terminated strings. */
2224 /* Store the transformed data into a big-enough buffer. */
2226 /* A 3X size guess avoids the overhead of calling strxfrm
2227 twice on typical implementations. Don't worry about
2228 size_t overflow, as the guess need not be correct. */
2229 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2230 if (bufsize < guess_bufsize)
2232 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2233 free (allocated);
2234 buf = allocated = malloc (bufsize);
2235 if (! buf)
2237 buf = stackbuf;
2238 bufsize = sizeof stackbuf;
2242 size_t sizea =
2243 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2244 bool a_fits = sizea <= bufsize;
2245 size_t sizeb =
2246 (textb < limb
2247 ? (xstrxfrm ((a_fits ? buf + sizea : nullptr), textb,
2248 (a_fits ? bufsize - sizea : 0))
2249 + 1)
2250 : 0);
2252 if (! (a_fits && sizea + sizeb <= bufsize))
2254 bufsize = sizea + sizeb;
2255 if (bufsize < SIZE_MAX / 3)
2256 bufsize = bufsize * 3 / 2;
2257 free (allocated);
2258 buf = allocated = xmalloc (bufsize);
2259 if (texta < lima)
2260 strxfrm (buf, texta, sizea);
2261 if (textb < limb)
2262 strxfrm (buf + sizea, textb, sizeb);
2265 /* Advance past NULs to the next part of each input string,
2266 exiting the loop if both strings are exhausted. When
2267 exiting the loop, prepare to finish off the tiebreaker
2268 comparison properly. */
2269 if (texta < lima)
2270 texta += strlen (texta) + 1;
2271 if (textb < limb)
2272 textb += strlen (textb) + 1;
2273 if (! (texta < lima || textb < limb))
2275 lena = sizea; texta = buf;
2276 lenb = sizeb; textb = buf + sizea;
2277 break;
2280 /* Accumulate the transformed data in the corresponding
2281 checksums. */
2282 md5_process_bytes (buf, sizea, &s[0]);
2283 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2285 /* Update the tiebreaker comparison of the transformed data. */
2286 if (! xfrm_diff)
2288 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2289 if (! xfrm_diff)
2290 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2295 /* Compute and compare the checksums. */
2296 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2297 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2298 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2300 /* Fall back on the tiebreaker if the checksums collide. */
2301 if (! diff)
2303 if (! xfrm_diff)
2305 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2306 if (! xfrm_diff)
2307 xfrm_diff = (lena > lenb) - (lena < lenb);
2310 diff = xfrm_diff;
2313 free (allocated);
2315 return diff;
2318 /* Return the printable width of the block of memory starting at
2319 TEXT and ending just before LIM, counting each tab as one byte.
2320 FIXME: Should we generally be counting non printable chars? */
2322 static size_t
2323 debug_width (char const *text, char const *lim)
2325 size_t width = mbsnwidth (text, lim - text, 0);
2326 while (text < lim)
2327 width += (*text++ == '\t');
2328 return width;
2331 /* For debug mode, "underline" a key at the
2332 specified offset and screen width. */
2334 static void
2335 mark_key (size_t offset, size_t width)
2337 while (offset--)
2338 putchar (' ');
2340 if (!width)
2341 printf (_("^ no match for key\n"));
2342 else
2345 putchar ('_');
2346 while (--width);
2348 putchar ('\n');
2352 /* Return true if KEY is a numeric key. */
2354 static inline bool
2355 key_numeric (struct keyfield const *key)
2357 return key->numeric || key->general_numeric || key->human_numeric;
2360 /* For LINE, output a debugging line that underlines KEY in LINE.
2361 If KEY is null, underline the whole line. */
2363 static void
2364 debug_key (struct line const *line, struct keyfield const *key)
2366 char *text = line->text;
2367 char *beg = text;
2368 char *lim = text + line->length - 1;
2370 if (key)
2372 if (key->sword != SIZE_MAX)
2373 beg = begfield (line, key);
2374 if (key->eword != SIZE_MAX)
2375 lim = limfield (line, key);
2377 if ((key->skipsblanks && key->sword == SIZE_MAX)
2378 || key->month || key_numeric (key))
2380 char saved = *lim;
2381 *lim = '\0';
2383 while (blanks[to_uchar (*beg)])
2384 beg++;
2386 char *tighter_lim = beg;
2388 if (lim < beg)
2389 tighter_lim = lim;
2390 else if (key->month)
2391 getmonth (beg, &tighter_lim);
2392 else if (key->general_numeric)
2393 ignore_value (strtold (beg, &tighter_lim));
2394 else if (key->numeric || key->human_numeric)
2396 char const *p = beg + (beg < lim && *beg == '-');
2397 char max_digit = traverse_raw_number (&p);
2398 if ('0' <= max_digit)
2400 unsigned char ch = *p;
2401 tighter_lim = (char *) p
2402 + (key->human_numeric && unit_order[ch]);
2405 else
2406 tighter_lim = lim;
2408 *lim = saved;
2409 lim = tighter_lim;
2413 size_t offset = debug_width (text, beg);
2414 size_t width = debug_width (beg, lim);
2415 mark_key (offset, width);
2418 /* Debug LINE by underlining its keys. */
2420 static void
2421 debug_line (struct line const *line)
2423 struct keyfield const *key = keylist;
2426 debug_key (line, key);
2427 while (key && ((key = key->next) || ! (unique || stable)));
2430 /* Return whether sorting options specified for key. */
2432 static bool
2433 default_key_compare (struct keyfield const *key)
2435 return ! (key->ignore
2436 || key->translate
2437 || key->skipsblanks
2438 || key->skipeblanks
2439 || key_numeric (key)
2440 || key->month
2441 || key->version
2442 || key->random
2443 /* || key->reverse */
2447 /* Convert a key to the short options used to specify it. */
2449 static void
2450 key_to_opts (struct keyfield const *key, char *opts)
2452 if (key->skipsblanks || key->skipeblanks)
2453 *opts++ = 'b';/* either disables global -b */
2454 if (key->ignore == nondictionary)
2455 *opts++ = 'd';
2456 if (key->translate)
2457 *opts++ = 'f';
2458 if (key->general_numeric)
2459 *opts++ = 'g';
2460 if (key->human_numeric)
2461 *opts++ = 'h';
2462 if (key->ignore == nonprinting)
2463 *opts++ = 'i';
2464 if (key->month)
2465 *opts++ = 'M';
2466 if (key->numeric)
2467 *opts++ = 'n';
2468 if (key->random)
2469 *opts++ = 'R';
2470 if (key->reverse)
2471 *opts++ = 'r';
2472 if (key->version)
2473 *opts++ = 'V';
2474 *opts = '\0';
2477 /* Output data independent key warnings to stderr. */
2479 static void
2480 key_warnings (struct keyfield const *gkey, bool gkey_only)
2482 struct keyfield const *key;
2483 struct keyfield ugkey = *gkey;
2484 unsigned long keynum = 1;
2485 bool basic_numeric_field = false;
2486 bool general_numeric_field = false;
2487 bool basic_numeric_field_span = false;
2488 bool general_numeric_field_span = false;
2490 for (key = keylist; key; key = key->next, keynum++)
2492 if (key_numeric (key))
2494 if (key->general_numeric)
2495 general_numeric_field = true;
2496 else
2497 basic_numeric_field = true;
2500 if (key->traditional_used)
2502 size_t sword = key->sword;
2503 size_t eword = key->eword;
2504 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2505 /* obsolescent syntax +A.x -B.y is equivalent to:
2506 -k A+1.x+1,B.y (when y = 0)
2507 -k A+1.x+1,B+1.y (when y > 0) */
2508 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2509 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2510 char *po = obuf;
2511 char *pn = nbuf;
2513 if (sword == SIZE_MAX)
2514 sword++;
2516 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2517 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2518 if (key->eword != SIZE_MAX)
2520 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2521 stpcpy (stpcpy (pn, ","),
2522 umaxtostr (eword + 1
2523 + (key->echar == SIZE_MAX), tmp));
2525 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2526 quote_n (0, obuf), quote_n (1, nbuf));
2529 /* Warn about field specs that will never match. */
2530 bool zero_width = key->sword != SIZE_MAX && key->eword < key->sword;
2531 if (zero_width)
2532 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2534 /* Warn about significant leading blanks. */
2535 bool implicit_skip = key_numeric (key) || key->month;
2536 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2537 if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
2538 && ((!key->skipsblanks && !implicit_skip)
2539 || (!key->skipsblanks && key->schar)
2540 || (!key->skipeblanks && key->echar)))
2541 error (0, 0, _("leading blanks are significant in key %lu; "
2542 "consider also specifying 'b'"), keynum);
2544 /* Warn about numeric comparisons spanning fields,
2545 as field delimiters could be interpreted as part
2546 of the number (maybe only in other locales). */
2547 if (!gkey_only && key_numeric (key))
2549 size_t sword = key->sword + 1;
2550 size_t eword = key->eword + 1;
2551 if (!sword)
2552 sword++;
2553 if (!eword || sword < eword)
2555 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2556 keynum);
2557 if (key->general_numeric)
2558 general_numeric_field_span = true;
2559 else
2560 basic_numeric_field_span = true;
2564 /* Flag global options not copied or specified in any key. */
2565 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2566 ugkey.ignore = nullptr;
2567 if (ugkey.translate && (ugkey.translate == key->translate))
2568 ugkey.translate = nullptr;
2569 ugkey.skipsblanks &= !key->skipsblanks;
2570 ugkey.skipeblanks &= !key->skipeblanks;
2571 ugkey.month &= !key->month;
2572 ugkey.numeric &= !key->numeric;
2573 ugkey.general_numeric &= !key->general_numeric;
2574 ugkey.human_numeric &= !key->human_numeric;
2575 ugkey.random &= !key->random;
2576 ugkey.version &= !key->version;
2577 ugkey.reverse &= !key->reverse;
2580 /* Explicitly warn if field delimiters in this locale
2581 don't constrain numbers. */
2582 bool number_locale_warned = false;
2583 if (basic_numeric_field_span)
2585 if (tab == TAB_DEFAULT
2586 ? thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep)))
2587 : tab == thousands_sep)
2589 error (0, 0,
2590 _("field separator %s is treated as a "
2591 "group separator in numbers"),
2592 quote (((char []) {thousands_sep, 0})));
2593 number_locale_warned = true;
2596 if (basic_numeric_field_span || general_numeric_field_span)
2598 if (tab == TAB_DEFAULT
2599 ? thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point)))
2600 : tab == decimal_point)
2602 error (0, 0,
2603 _("field separator %s is treated as a "
2604 "decimal point in numbers"),
2605 quote (((char []) {decimal_point, 0})));
2606 number_locale_warned = true;
2608 else if (tab == '-')
2610 error (0, 0,
2611 _("field separator %s is treated as a "
2612 "minus sign in numbers"),
2613 quote (((char []) {tab, 0})));
2615 else if (general_numeric_field_span && tab == '+')
2617 error (0, 0,
2618 _("field separator %s is treated as a "
2619 "plus sign in numbers"),
2620 quote (((char []) {tab, 0})));
2624 /* Explicitly indicate the decimal point used in this locale,
2625 as it suggests that robust scripts need to consider
2626 setting the locale when comparing numbers. */
2627 if ((basic_numeric_field || general_numeric_field) && ! number_locale_warned)
2629 error (0, 0,
2630 _("%snumbers use %s as a decimal point in this locale"),
2631 tab == decimal_point ? "" : _("note "),
2632 quote (((char []) {decimal_point, 0})));
2636 if (basic_numeric_field && thousands_sep_ignored)
2638 error (0, 0,
2639 _("the multi-byte number group separator "
2640 "in this locale is not supported"));
2643 /* Warn about ignored global options flagged above.
2644 This clears all flags if UGKEY is the only one in the list. */
2645 if (!default_key_compare (&ugkey)
2646 || (ugkey.reverse && (stable || unique) && keylist))
2648 bool ugkey_reverse = ugkey.reverse;
2649 if (!(stable || unique))
2650 ugkey.reverse = false;
2651 /* The following is too big, but guaranteed to be "big enough". */
2652 char opts[sizeof short_options];
2653 key_to_opts (&ugkey, opts);
2654 error (0, 0,
2655 ngettext ("option '-%s' is ignored",
2656 "options '-%s' are ignored",
2657 select_plural (strlen (opts))), opts);
2658 ugkey.reverse = ugkey_reverse;
2660 if (ugkey.reverse && !(stable || unique) && keylist)
2661 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2664 /* Return either the sense of DIFF or its reverse, depending on REVERSED.
2665 If REVERSED, do not simply negate DIFF as that can mishandle INT_MIN. */
2667 static int
2668 diff_reversed (int diff, bool reversed)
2670 return reversed ? (diff < 0) - (diff > 0) : diff;
2673 /* Compare two lines A and B trying every key in sequence until there
2674 are no more keys or a difference is found. */
2676 static int
2677 keycompare (struct line const *a, struct line const *b)
2679 struct keyfield *key = keylist;
2681 /* For the first iteration only, the key positions have been
2682 precomputed for us. */
2683 char *texta = a->keybeg;
2684 char *textb = b->keybeg;
2685 char *lima = a->keylim;
2686 char *limb = b->keylim;
2688 int diff;
2690 while (true)
2692 char const *translate = key->translate;
2693 bool const *ignore = key->ignore;
2695 /* Treat field ends before field starts as empty fields. */
2696 lima = MAX (texta, lima);
2697 limb = MAX (textb, limb);
2699 /* Find the lengths. */
2700 size_t lena = lima - texta;
2701 size_t lenb = limb - textb;
2703 if (hard_LC_COLLATE || key_numeric (key)
2704 || key->month || key->random || key->version)
2706 /* Ordinarily use the keys in-place, temporarily null-terminated. */
2707 char *ta = texta;
2708 char *tb = textb;
2709 size_t tlena = lena;
2710 size_t tlenb = lenb;
2711 char enda = ta[tlena];
2712 char endb = tb[tlenb];
2714 void *allocated = nullptr;
2715 char stackbuf[4000];
2717 if (ignore || translate)
2719 /* Compute with copies of the keys, which are the result of
2720 translating or ignoring characters, and which need their
2721 own storage. */
2723 size_t i;
2725 /* Allocate space for copies. */
2726 size_t size = lena + 1 + lenb + 1;
2727 if (size <= sizeof stackbuf)
2728 ta = stackbuf;
2729 else
2730 ta = allocated = xmalloc (size);
2731 tb = ta + lena + 1;
2733 /* Put into each copy a version of the key in which the
2734 requested characters are ignored or translated. */
2735 for (tlena = i = 0; i < lena; i++)
2736 if (! (ignore && ignore[to_uchar (texta[i])]))
2737 ta[tlena++] = (translate
2738 ? translate[to_uchar (texta[i])]
2739 : texta[i]);
2741 for (tlenb = i = 0; i < lenb; i++)
2742 if (! (ignore && ignore[to_uchar (textb[i])]))
2743 tb[tlenb++] = (translate
2744 ? translate[to_uchar (textb[i])]
2745 : textb[i]);
2748 ta[tlena] = '\0';
2749 tb[tlenb] = '\0';
2751 if (key->numeric)
2752 diff = numcompare (ta, tb);
2753 else if (key->general_numeric)
2754 diff = general_numcompare (ta, tb);
2755 else if (key->human_numeric)
2756 diff = human_numcompare (ta, tb);
2757 else if (key->month)
2758 diff = getmonth (ta, nullptr) - getmonth (tb, nullptr);
2759 else if (key->random)
2760 diff = compare_random (ta, tlena, tb, tlenb);
2761 else if (key->version)
2762 diff = filenvercmp (ta, tlena, tb, tlenb);
2763 else
2765 /* Locale-dependent string sorting. This is slower than
2766 C-locale sorting, which is implemented below. */
2767 if (tlena == 0)
2768 diff = - NONZERO (tlenb);
2769 else if (tlenb == 0)
2770 diff = 1;
2771 else
2772 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2775 ta[tlena] = enda;
2776 tb[tlenb] = endb;
2778 free (allocated);
2780 else if (ignore)
2782 #define CMP_WITH_IGNORE(A, B) \
2783 do \
2785 while (true) \
2787 while (texta < lima && ignore[to_uchar (*texta)]) \
2788 ++texta; \
2789 while (textb < limb && ignore[to_uchar (*textb)]) \
2790 ++textb; \
2791 if (! (texta < lima && textb < limb)) \
2793 diff = (texta < lima) - (textb < limb); \
2794 break; \
2796 diff = to_uchar (A) - to_uchar (B); \
2797 if (diff) \
2798 break; \
2799 ++texta; \
2800 ++textb; \
2804 while (0)
2806 if (translate)
2807 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2808 translate[to_uchar (*textb)]);
2809 else
2810 CMP_WITH_IGNORE (*texta, *textb);
2812 else
2814 size_t lenmin = MIN (lena, lenb);
2815 if (lenmin == 0)
2816 diff = 0;
2817 else if (translate)
2819 size_t i = 0;
2822 diff = (to_uchar (translate[to_uchar (texta[i])])
2823 - to_uchar (translate[to_uchar (textb[i])]));
2824 if (diff)
2825 break;
2826 i++;
2828 while (i < lenmin);
2830 else
2831 diff = memcmp (texta, textb, lenmin);
2833 if (! diff)
2834 diff = (lena > lenb) - (lena < lenb);
2837 if (diff)
2838 break;
2840 key = key->next;
2841 if (! key)
2842 return 0;
2844 /* Find the beginning and limit of the next field. */
2845 if (key->eword != SIZE_MAX)
2846 lima = limfield (a, key), limb = limfield (b, key);
2847 else
2848 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2850 if (key->sword != SIZE_MAX)
2851 texta = begfield (a, key), textb = begfield (b, key);
2852 else
2854 texta = a->text, textb = b->text;
2855 if (key->skipsblanks)
2857 while (texta < lima && blanks[to_uchar (*texta)])
2858 ++texta;
2859 while (textb < limb && blanks[to_uchar (*textb)])
2860 ++textb;
2865 return diff_reversed (diff, key->reverse);
2868 /* Compare two lines A and B, returning negative, zero, or positive
2869 depending on whether A compares less than, equal to, or greater than B. */
2871 static int
2872 compare (struct line const *a, struct line const *b)
2874 int diff;
2875 size_t alen, blen;
2877 /* First try to compare on the specified keys (if any).
2878 The only two cases with no key at all are unadorned sort,
2879 and unadorned sort -r. */
2880 if (keylist)
2882 diff = keycompare (a, b);
2883 if (diff || unique || stable)
2884 return diff;
2887 /* If the keys all compare equal (or no keys were specified)
2888 fall through to the default comparison. */
2889 alen = a->length - 1, blen = b->length - 1;
2891 if (alen == 0)
2892 diff = - NONZERO (blen);
2893 else if (blen == 0)
2894 diff = 1;
2895 else if (hard_LC_COLLATE)
2897 /* xmemcoll0 is a performance enhancement as
2898 it will not unconditionally write '\0' after the
2899 passed in buffers, which was seen to give around
2900 a 3% increase in performance for short lines. */
2901 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2903 else
2905 diff = memcmp (a->text, b->text, MIN (alen, blen));
2906 if (!diff)
2907 diff = (alen > blen) - (alen < blen);
2910 return diff_reversed (diff, reverse);
2913 /* Write LINE to output stream FP; the output file's name is
2914 OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
2915 otherwise. If debugging is enabled and FP is standard output,
2916 append some debugging information. */
2918 static void
2919 write_line (struct line const *line, FILE *fp, char const *output_file)
2921 char *buf = line->text;
2922 size_t n_bytes = line->length;
2923 char *ebuf = buf + n_bytes;
2925 if (!output_file && debug)
2927 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2928 char const *c = buf;
2930 while (c < ebuf)
2932 char wc = *c++;
2933 if (wc == '\t')
2934 wc = '>';
2935 else if (c == ebuf)
2936 wc = '\n';
2937 if (fputc (wc, fp) == EOF)
2938 sort_die (_("write failed"), output_file);
2941 debug_line (line);
2943 else
2945 ebuf[-1] = eolchar;
2946 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2947 sort_die (_("write failed"), output_file);
2948 ebuf[-1] = '\0';
2952 /* Check that the lines read from FILE_NAME come in order. Return
2953 true if they are in order. If CHECKONLY == 'c', also print a
2954 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2955 they are not in order. */
2957 static bool
2958 check (char const *file_name, char checkonly)
2960 FILE *fp = xfopen (file_name, "r");
2961 struct buffer buf; /* Input buffer. */
2962 struct line temp; /* Copy of previous line. */
2963 size_t alloc = 0;
2964 uintmax_t line_number = 0;
2965 struct keyfield const *key = keylist;
2966 bool nonunique = ! unique;
2967 bool ordered = true;
2969 initbuf (&buf, sizeof (struct line),
2970 MAX (merge_buffer_size, sort_size));
2971 temp.text = nullptr;
2973 while (fillbuf (&buf, fp, file_name))
2975 struct line const *line = buffer_linelim (&buf);
2976 struct line const *linebase = line - buf.nlines;
2978 /* Make sure the line saved from the old buffer contents is
2979 less than or equal to the first line of the new buffer. */
2980 if (alloc && nonunique <= compare (&temp, line - 1))
2982 found_disorder:
2984 if (checkonly == 'c')
2986 struct line const *disorder_line = line - 1;
2987 uintmax_t disorder_line_number =
2988 buffer_linelim (&buf) - disorder_line + line_number;
2989 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2990 fprintf (stderr, _("%s: %s:%s: disorder: "),
2991 program_name, file_name,
2992 umaxtostr (disorder_line_number, hr_buf));
2993 write_line (disorder_line, stderr, _("standard error"));
2996 ordered = false;
2997 break;
3001 /* Compare each line in the buffer with its successor. */
3002 while (linebase < --line)
3003 if (nonunique <= compare (line, line - 1))
3004 goto found_disorder;
3006 line_number += buf.nlines;
3008 /* Save the last line of the buffer. */
3009 if (alloc < line->length)
3013 alloc *= 2;
3014 if (! alloc)
3016 alloc = line->length;
3017 break;
3020 while (alloc < line->length);
3022 free (temp.text);
3023 temp.text = xmalloc (alloc);
3025 memcpy (temp.text, line->text, line->length);
3026 temp.length = line->length;
3027 if (key)
3029 temp.keybeg = temp.text + (line->keybeg - line->text);
3030 temp.keylim = temp.text + (line->keylim - line->text);
3034 xfclose (fp, file_name);
3035 free (buf.buf);
3036 free (temp.text);
3037 return ordered;
3040 /* Open FILES (there are NFILES of them) and store the resulting array
3041 of stream pointers into (*PFPS). Allocate the array. Return the
3042 number of successfully opened files, setting errno if this value is
3043 less than NFILES. */
3045 static size_t
3046 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
3048 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
3049 int i;
3051 /* Open as many input files as we can. */
3052 for (i = 0; i < nfiles; i++)
3054 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
3055 ? open_temp (files[i].temp)
3056 : stream_open (files[i].name, "r"));
3057 if (!fps[i])
3058 break;
3061 return i;
3064 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3065 files (all of which are at the start of the FILES array), and
3066 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3067 FPS is the vector of open stream corresponding to the files.
3068 Close input and output streams before returning.
3069 OUTPUT_FILE gives the name of the output file. If it is null,
3070 the output file is standard output. */
3072 static void
3073 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
3074 FILE *ofp, char const *output_file, FILE **fps)
3076 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
3077 /* Input buffers for each file. */
3078 struct line saved; /* Saved line storage for unique check. */
3079 struct line const *savedline = nullptr;
3080 /* &saved if there is a saved line. */
3081 size_t savealloc = 0; /* Size allocated for the saved line. */
3082 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
3083 /* Current line in each line table. */
3084 struct line const **base = xnmalloc (nfiles, sizeof *base);
3085 /* Base of each line table. */
3086 size_t *ord = xnmalloc (nfiles, sizeof *ord);
3087 /* Table representing a permutation of fps,
3088 such that cur[ord[0]] is the smallest line
3089 and will be next output. */
3090 size_t i;
3091 size_t j;
3092 size_t t;
3093 struct keyfield const *key = keylist;
3094 saved.text = nullptr;
3096 /* Read initial lines from each input file. */
3097 for (i = 0; i < nfiles; )
3099 initbuf (&buffer[i], sizeof (struct line),
3100 MAX (merge_buffer_size, sort_size / nfiles));
3101 if (fillbuf (&buffer[i], fps[i], files[i].name))
3103 struct line const *linelim = buffer_linelim (&buffer[i]);
3104 cur[i] = linelim - 1;
3105 base[i] = linelim - buffer[i].nlines;
3106 i++;
3108 else
3110 /* fps[i] is empty; eliminate it from future consideration. */
3111 xfclose (fps[i], files[i].name);
3112 if (i < ntemps)
3114 ntemps--;
3115 zaptemp (files[i].name);
3117 free (buffer[i].buf);
3118 --nfiles;
3119 for (j = i; j < nfiles; ++j)
3121 files[j] = files[j + 1];
3122 fps[j] = fps[j + 1];
3127 /* Set up the ord table according to comparisons among input lines.
3128 Since this only reorders two items if one is strictly greater than
3129 the other, it is stable. */
3130 for (i = 0; i < nfiles; ++i)
3131 ord[i] = i;
3132 for (i = 1; i < nfiles; ++i)
3133 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
3134 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
3136 /* Repeatedly output the smallest line until no input remains. */
3137 while (nfiles)
3139 struct line const *smallest = cur[ord[0]];
3141 /* If uniquified output is turned on, output only the first of
3142 an identical series of lines. */
3143 if (unique)
3145 if (savedline && compare (savedline, smallest))
3147 savedline = nullptr;
3148 write_line (&saved, ofp, output_file);
3150 if (!savedline)
3152 savedline = &saved;
3153 if (savealloc < smallest->length)
3156 if (! savealloc)
3158 savealloc = smallest->length;
3159 break;
3161 while ((savealloc *= 2) < smallest->length);
3163 free (saved.text);
3164 saved.text = xmalloc (savealloc);
3166 saved.length = smallest->length;
3167 memcpy (saved.text, smallest->text, saved.length);
3168 if (key)
3170 saved.keybeg =
3171 saved.text + (smallest->keybeg - smallest->text);
3172 saved.keylim =
3173 saved.text + (smallest->keylim - smallest->text);
3177 else
3178 write_line (smallest, ofp, output_file);
3180 /* Check if we need to read more lines into memory. */
3181 if (base[ord[0]] < smallest)
3182 cur[ord[0]] = smallest - 1;
3183 else
3185 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
3187 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
3188 cur[ord[0]] = linelim - 1;
3189 base[ord[0]] = linelim - buffer[ord[0]].nlines;
3191 else
3193 /* We reached EOF on fps[ord[0]]. */
3194 for (i = 1; i < nfiles; ++i)
3195 if (ord[i] > ord[0])
3196 --ord[i];
3197 --nfiles;
3198 xfclose (fps[ord[0]], files[ord[0]].name);
3199 if (ord[0] < ntemps)
3201 ntemps--;
3202 zaptemp (files[ord[0]].name);
3204 free (buffer[ord[0]].buf);
3205 for (i = ord[0]; i < nfiles; ++i)
3207 fps[i] = fps[i + 1];
3208 files[i] = files[i + 1];
3209 buffer[i] = buffer[i + 1];
3210 cur[i] = cur[i + 1];
3211 base[i] = base[i + 1];
3213 for (i = 0; i < nfiles; ++i)
3214 ord[i] = ord[i + 1];
3215 continue;
3219 /* The new line just read in may be larger than other lines
3220 already in main memory; push it back in the queue until we
3221 encounter a line larger than it. Optimize for the common
3222 case where the new line is smallest. */
3224 size_t lo = 1;
3225 size_t hi = nfiles;
3226 size_t probe = lo;
3227 size_t ord0 = ord[0];
3228 size_t count_of_smaller_lines;
3230 while (lo < hi)
3232 int cmp = compare (cur[ord0], cur[ord[probe]]);
3233 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3234 hi = probe;
3235 else
3236 lo = probe + 1;
3237 probe = (lo + hi) / 2;
3240 count_of_smaller_lines = lo - 1;
3241 for (j = 0; j < count_of_smaller_lines; j++)
3242 ord[j] = ord[j + 1];
3243 ord[count_of_smaller_lines] = ord0;
3247 if (unique && savedline)
3249 write_line (&saved, ofp, output_file);
3250 free (saved.text);
3253 xfclose (ofp, output_file);
3254 free (fps);
3255 free (buffer);
3256 free (ord);
3257 free (base);
3258 free (cur);
3261 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3262 files (all of which are at the start of the FILES array), and
3263 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3264 Close input and output files before returning.
3265 OUTPUT_FILE gives the name of the output file.
3267 Return the number of files successfully merged. This number can be
3268 less than NFILES if we ran low on file descriptors, but in this
3269 case it is never less than 2. */
3271 static size_t
3272 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3273 FILE *ofp, char const *output_file)
3275 FILE **fps;
3276 size_t nopened = open_input_files (files, nfiles, &fps);
3277 if (nopened < nfiles && nopened < 2)
3278 sort_die (_("open failed"), files[nopened].name);
3279 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3280 return nopened;
3283 /* Merge into T (of size NLINES) the two sorted arrays of lines
3284 LO (with NLINES / 2 members), and
3285 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3286 T and LO point just past their respective arrays, and the arrays
3287 are in reverse order. NLINES must be at least 2. */
3289 static void
3290 mergelines (struct line *restrict t, size_t nlines,
3291 struct line const *restrict lo)
3293 size_t nlo = nlines / 2;
3294 size_t nhi = nlines - nlo;
3295 struct line *hi = t - nlo;
3297 while (true)
3298 if (compare (lo - 1, hi - 1) <= 0)
3300 *--t = *--lo;
3301 if (! --nlo)
3303 /* HI must equal T now, and there is no need to copy from
3304 HI to T. */
3305 return;
3308 else
3310 *--t = *--hi;
3311 if (! --nhi)
3314 *--t = *--lo;
3315 while (--nlo);
3317 return;
3322 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3323 Do this all within one thread. NLINES must be at least 2.
3324 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3325 Otherwise the sort is in-place and TEMP is half-sized.
3326 The input and output arrays are in reverse order, and LINES and
3327 TEMP point just past the end of their respective arrays.
3329 Use a recursive divide-and-conquer algorithm, in the style
3330 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3331 the optimization suggested by exercise 5.2.4-10; this requires room
3332 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3333 writes that this memory optimization was originally published by
3334 D. A. Bell, Comp J. 1 (1958), 75. */
3336 static void
3337 sequential_sort (struct line *restrict lines, size_t nlines,
3338 struct line *restrict temp, bool to_temp)
3340 if (nlines == 2)
3342 /* Declare 'swap' as int, not bool, to work around a bug
3343 <https://lists.gnu.org/r/bug-coreutils/2005-10/msg00086.html>
3344 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3345 int swap = (0 < compare (&lines[-1], &lines[-2]));
3346 if (to_temp)
3348 temp[-1] = lines[-1 - swap];
3349 temp[-2] = lines[-2 + swap];
3351 else if (swap)
3353 temp[-1] = lines[-1];
3354 lines[-1] = lines[-2];
3355 lines[-2] = temp[-1];
3358 else
3360 size_t nlo = nlines / 2;
3361 size_t nhi = nlines - nlo;
3362 struct line *lo = lines;
3363 struct line *hi = lines - nlo;
3365 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3366 if (1 < nlo)
3367 sequential_sort (lo, nlo, temp, !to_temp);
3368 else if (!to_temp)
3369 temp[-1] = lo[-1];
3371 struct line *dest;
3372 struct line const *sorted_lo;
3373 if (to_temp)
3375 dest = temp;
3376 sorted_lo = lines;
3378 else
3380 dest = lines;
3381 sorted_lo = temp;
3383 mergelines (dest, nlines, sorted_lo);
3387 static struct merge_node *init_node (struct merge_node *restrict,
3388 struct merge_node *restrict,
3389 struct line *, size_t, size_t, bool);
3392 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3393 lines, with destination DEST. */
3394 static struct merge_node *
3395 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3397 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3399 struct merge_node *root = merge_tree;
3400 root->lo = root->hi = root->end_lo = root->end_hi = nullptr;
3401 root->dest = nullptr;
3402 root->nlo = root->nhi = nlines;
3403 root->parent = nullptr;
3404 root->level = MERGE_END;
3405 root->queued = false;
3406 pthread_mutex_init (&root->lock, nullptr);
3408 init_node (root, root + 1, dest, nthreads, nlines, false);
3409 return merge_tree;
3412 /* Destroy the merge tree. */
3413 static void
3414 merge_tree_destroy (size_t nthreads, struct merge_node *merge_tree)
3416 size_t n_nodes = nthreads * 2;
3417 struct merge_node *node = merge_tree;
3419 while (n_nodes--)
3421 pthread_mutex_destroy (&node->lock);
3422 node++;
3425 free (merge_tree);
3428 /* Initialize a merge tree node and its descendants. The node's
3429 parent is PARENT. The node and its descendants are taken from the
3430 array of nodes NODE_POOL. Their destination starts at DEST; they
3431 will consume NTHREADS threads. The total number of sort lines is
3432 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3433 its parent. */
3435 static struct merge_node *
3436 init_node (struct merge_node *restrict parent,
3437 struct merge_node *restrict node_pool,
3438 struct line *dest, size_t nthreads,
3439 size_t total_lines, bool is_lo_child)
3441 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3442 size_t nlo = nlines / 2;
3443 size_t nhi = nlines - nlo;
3444 struct line *lo = dest - total_lines;
3445 struct line *hi = lo - nlo;
3446 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3448 struct merge_node *node = node_pool++;
3449 node->lo = node->end_lo = lo;
3450 node->hi = node->end_hi = hi;
3451 node->dest = parent_end;
3452 node->nlo = nlo;
3453 node->nhi = nhi;
3454 node->parent = parent;
3455 node->level = parent->level + 1;
3456 node->queued = false;
3457 pthread_mutex_init (&node->lock, nullptr);
3459 if (nthreads > 1)
3461 size_t lo_threads = nthreads / 2;
3462 size_t hi_threads = nthreads - lo_threads;
3463 node->lo_child = node_pool;
3464 node_pool = init_node (node, node_pool, lo, lo_threads,
3465 total_lines, true);
3466 node->hi_child = node_pool;
3467 node_pool = init_node (node, node_pool, hi, hi_threads,
3468 total_lines, false);
3470 else
3472 node->lo_child = nullptr;
3473 node->hi_child = nullptr;
3475 return node_pool;
3479 /* Compare two merge nodes A and B for priority. */
3481 static int
3482 compare_nodes (void const *a, void const *b)
3484 struct merge_node const *nodea = a;
3485 struct merge_node const *nodeb = b;
3486 if (nodea->level == nodeb->level)
3487 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3488 return nodea->level < nodeb->level;
3491 /* Lock a merge tree NODE. */
3493 static inline void
3494 lock_node (struct merge_node *node)
3496 pthread_mutex_lock (&node->lock);
3499 /* Unlock a merge tree NODE. */
3501 static inline void
3502 unlock_node (struct merge_node *node)
3504 pthread_mutex_unlock (&node->lock);
3507 /* Destroy merge QUEUE. */
3509 static void
3510 queue_destroy (struct merge_node_queue *queue)
3512 heap_free (queue->priority_queue);
3513 pthread_cond_destroy (&queue->cond);
3514 pthread_mutex_destroy (&queue->mutex);
3517 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3518 NTHREADS threads. */
3520 static void
3521 queue_init (struct merge_node_queue *queue, size_t nthreads)
3523 /* Though it's highly unlikely all nodes are in the heap at the same
3524 time, the heap should accommodate all of them. Counting a null
3525 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3526 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3527 pthread_mutex_init (&queue->mutex, nullptr);
3528 pthread_cond_init (&queue->cond, nullptr);
3531 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3532 does not need to lock NODE. */
3534 static void
3535 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3537 pthread_mutex_lock (&queue->mutex);
3538 heap_insert (queue->priority_queue, node);
3539 node->queued = true;
3540 pthread_cond_signal (&queue->cond);
3541 pthread_mutex_unlock (&queue->mutex);
3544 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3546 static struct merge_node *
3547 queue_pop (struct merge_node_queue *queue)
3549 struct merge_node *node;
3550 pthread_mutex_lock (&queue->mutex);
3551 while (! (node = heap_remove_top (queue->priority_queue)))
3552 pthread_cond_wait (&queue->cond, &queue->mutex);
3553 pthread_mutex_unlock (&queue->mutex);
3554 lock_node (node);
3555 node->queued = false;
3556 return node;
3559 /* Output LINE to TFP, unless -u is specified and the line compares
3560 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3561 is null if TFP is standard output.
3563 This function does not save the line for comparison later, so it is
3564 appropriate only for internal sort. */
3566 static void
3567 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3569 if (unique)
3571 if (saved_line.text && ! compare (line, &saved_line))
3572 return;
3573 saved_line = *line;
3576 write_line (line, tfp, temp_output);
3579 /* Merge the lines currently available to a NODE in the binary
3580 merge tree. Merge a number of lines appropriate for this merge
3581 level, assuming TOTAL_LINES is the total number of lines.
3583 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3584 the name of TFP, or is null if TFP is standard output. */
3586 static void
3587 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3588 FILE *tfp, char const *temp_output)
3590 struct line *lo_orig = node->lo;
3591 struct line *hi_orig = node->hi;
3592 size_t to_merge = MAX_MERGE (total_lines, node->level);
3593 size_t merged_lo;
3594 size_t merged_hi;
3596 if (node->level > MERGE_ROOT)
3598 /* Merge to destination buffer. */
3599 struct line *dest = *node->dest;
3600 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3601 if (compare (node->lo - 1, node->hi - 1) <= 0)
3602 *--dest = *--node->lo;
3603 else
3604 *--dest = *--node->hi;
3606 merged_lo = lo_orig - node->lo;
3607 merged_hi = hi_orig - node->hi;
3609 if (node->nhi == merged_hi)
3610 while (node->lo != node->end_lo && to_merge--)
3611 *--dest = *--node->lo;
3612 else if (node->nlo == merged_lo)
3613 while (node->hi != node->end_hi && to_merge--)
3614 *--dest = *--node->hi;
3615 *node->dest = dest;
3617 else
3619 /* Merge directly to output. */
3620 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3622 if (compare (node->lo - 1, node->hi - 1) <= 0)
3623 write_unique (--node->lo, tfp, temp_output);
3624 else
3625 write_unique (--node->hi, tfp, temp_output);
3628 merged_lo = lo_orig - node->lo;
3629 merged_hi = hi_orig - node->hi;
3631 if (node->nhi == merged_hi)
3633 while (node->lo != node->end_lo && to_merge--)
3634 write_unique (--node->lo, tfp, temp_output);
3636 else if (node->nlo == merged_lo)
3638 while (node->hi != node->end_hi && to_merge--)
3639 write_unique (--node->hi, tfp, temp_output);
3643 /* Update NODE. */
3644 merged_lo = lo_orig - node->lo;
3645 merged_hi = hi_orig - node->hi;
3646 node->nlo -= merged_lo;
3647 node->nhi -= merged_hi;
3650 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3651 NODE's children has available lines and the other either has
3652 available lines or has exhausted its lines. */
3654 static void
3655 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3657 if (! node->queued)
3659 bool lo_avail = (node->lo - node->end_lo) != 0;
3660 bool hi_avail = (node->hi - node->end_hi) != 0;
3661 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3662 queue_insert (queue, node);
3666 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3668 static void
3669 queue_check_insert_parent (struct merge_node_queue *queue,
3670 struct merge_node *node)
3672 if (node->level > MERGE_ROOT)
3674 lock_node (node->parent);
3675 queue_check_insert (queue, node->parent);
3676 unlock_node (node->parent);
3678 else if (node->nlo + node->nhi == 0)
3680 /* If the MERGE_ROOT NODE has finished merging, insert the
3681 MERGE_END node. */
3682 queue_insert (queue, node->parent);
3686 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3687 some of those lines, until the MERGE_END node is popped.
3688 TOTAL_LINES is the total number of lines. If merging at the top
3689 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3690 null if TFP is standard output. */
3692 static void
3693 merge_loop (struct merge_node_queue *queue,
3694 size_t total_lines, FILE *tfp, char const *temp_output)
3696 while (true)
3698 struct merge_node *node = queue_pop (queue);
3700 if (node->level == MERGE_END)
3702 unlock_node (node);
3703 /* Reinsert so other threads can pop it. */
3704 queue_insert (queue, node);
3705 break;
3707 mergelines_node (node, total_lines, tfp, temp_output);
3708 queue_check_insert (queue, node);
3709 queue_check_insert_parent (queue, node);
3711 unlock_node (node);
3716 static void sortlines (struct line *restrict, size_t, size_t,
3717 struct merge_node *, struct merge_node_queue *,
3718 FILE *, char const *);
3720 /* Thread arguments for sortlines_thread. */
3722 struct thread_args
3724 /* Source, i.e., the array of lines to sort. This points just past
3725 the end of the array. */
3726 struct line *lines;
3728 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3729 size_t nthreads;
3731 /* Number of lines in LINES and DEST. */
3732 size_t const total_lines;
3734 /* Merge node. Lines from this node and this node's sibling will merged
3735 to this node's parent. */
3736 struct merge_node *const node;
3738 /* The priority queue controlling available work for the entire
3739 internal sort. */
3740 struct merge_node_queue *const queue;
3742 /* If at the top level, the file to output to, and the file's name.
3743 If the file is standard output, the file's name is null. */
3744 FILE *tfp;
3745 char const *output_temp;
3748 /* Like sortlines, except with a signature acceptable to pthread_create. */
3750 static void *
3751 sortlines_thread (void *data)
3753 struct thread_args const *args = data;
3754 sortlines (args->lines, args->nthreads, args->total_lines,
3755 args->node, args->queue, args->tfp,
3756 args->output_temp);
3757 return nullptr;
3760 /* Sort lines, possibly in parallel. The arguments are as in struct
3761 thread_args above.
3763 The algorithm has three phases: node creation, sequential sort,
3764 and binary merge.
3766 During node creation, sortlines recursively visits each node in the
3767 binary merge tree and creates a NODE structure corresponding to all the
3768 future line merging NODE is responsible for. For each call to
3769 sortlines, half the available threads are assigned to each recursive
3770 call, until a leaf node having only 1 available thread is reached.
3772 Each leaf node then performs two sequential sorts, one on each half of
3773 the lines it is responsible for. It records in its NODE structure that
3774 there are two sorted sublists available to merge from, and inserts its
3775 NODE into the priority queue.
3777 The binary merge phase then begins. Each thread drops into a loop
3778 where the thread retrieves a NODE from the priority queue, merges lines
3779 available to that NODE, and potentially insert NODE or its parent back
3780 into the queue if there are sufficient available lines for them to
3781 merge. This continues until all lines at all nodes of the merge tree
3782 have been merged. */
3784 static void
3785 sortlines (struct line *restrict lines, size_t nthreads,
3786 size_t total_lines, struct merge_node *node,
3787 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3789 size_t nlines = node->nlo + node->nhi;
3791 /* Calculate thread arguments. */
3792 size_t lo_threads = nthreads / 2;
3793 size_t hi_threads = nthreads - lo_threads;
3794 pthread_t thread;
3795 struct thread_args args = {lines, lo_threads, total_lines,
3796 node->lo_child, queue, tfp, temp_output};
3798 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3799 && pthread_create (&thread, nullptr, sortlines_thread, &args) == 0)
3801 sortlines (lines - node->nlo, hi_threads, total_lines,
3802 node->hi_child, queue, tfp, temp_output);
3803 pthread_join (thread, nullptr);
3805 else
3807 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3808 Sort with 1 thread. */
3809 size_t nlo = node->nlo;
3810 size_t nhi = node->nhi;
3811 struct line *temp = lines - total_lines;
3812 if (1 < nhi)
3813 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3814 if (1 < nlo)
3815 sequential_sort (lines, nlo, temp, false);
3817 /* Update merge NODE. No need to lock yet. */
3818 node->lo = lines;
3819 node->hi = lines - nlo;
3820 node->end_lo = lines - nlo;
3821 node->end_hi = lines - nlo - nhi;
3823 queue_insert (queue, node);
3824 merge_loop (queue, total_lines, tfp, temp_output);
3828 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3829 the same as OUTFILE. If found, replace each with the same
3830 temporary copy that can be merged into OUTFILE without destroying
3831 OUTFILE before it is completely read. This temporary copy does not
3832 count as a merge temp, so don't worry about incrementing NTEMPS in
3833 the caller; final cleanup will remove it, not zaptemp.
3835 This test ensures that an otherwise-erroneous use like
3836 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3837 It's not clear that POSIX requires this nicety.
3838 Detect common error cases, but don't try to catch obscure cases like
3839 "cat ... FILE ... | sort -m -o FILE"
3840 where traditional "sort" doesn't copy the input and where
3841 people should know that they're getting into trouble anyway.
3842 Catching these obscure cases would slow down performance in
3843 common cases. */
3845 static void
3846 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3847 size_t nfiles, char const *outfile)
3849 struct tempnode *tempcopy = nullptr;
3851 for (size_t i = ntemps; i < nfiles; i++)
3853 bool is_stdin = STREQ (files[i].name, "-");
3854 bool same;
3855 struct stat instat;
3857 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3858 same = true;
3859 else
3861 struct stat *outst = get_outstatus ();
3862 if (!outst)
3863 break;
3865 same = (((is_stdin
3866 ? fstat (STDIN_FILENO, &instat)
3867 : stat (files[i].name, &instat))
3868 == 0)
3869 && psame_inode (&instat, outst));
3872 if (same)
3874 if (! tempcopy)
3876 FILE *tftp;
3877 tempcopy = create_temp (&tftp);
3878 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3881 files[i].name = tempcopy->name;
3882 files[i].temp = tempcopy;
3887 /* Scan the input files to ensure all are accessible.
3888 Otherwise exit with a diagnostic.
3890 This will catch common issues with permissions etc.
3891 but will fail to notice issues where you can open but not read,
3892 like when a directory is specified on some systems.
3893 Catching these obscure cases could slow down performance in
3894 common cases. */
3896 static void
3897 check_inputs (char *const *files, size_t nfiles)
3899 for (size_t i = 0; i < nfiles; i++)
3901 if (STREQ (files[i], "-"))
3902 continue;
3904 if (euidaccess (files[i], R_OK) != 0)
3905 sort_die (_("cannot read"), files[i]);
3909 /* Ensure a specified output file can be created or written to,
3910 and point stdout to it. Do not truncate the file.
3911 Exit with a diagnostic on failure. */
3913 static void
3914 check_output (char const *outfile)
3916 if (outfile)
3918 int oflags = O_WRONLY | O_BINARY | O_CLOEXEC | O_CREAT;
3919 int outfd = open (outfile, oflags, MODE_RW_UGO);
3920 if (outfd < 0)
3921 sort_die (_("open failed"), outfile);
3922 move_fd (outfd, STDOUT_FILENO);
3926 /* Merge the input FILES. NTEMPS is the number of files at the
3927 start of FILES that are temporary; it is zero at the top level.
3928 NFILES is the total number of files. Put the output in
3929 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3931 static void
3932 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3933 char const *output_file)
3935 while (nmerge < nfiles)
3937 /* Number of input files processed so far. */
3938 size_t in;
3940 /* Number of output files generated so far. */
3941 size_t out;
3943 /* nfiles % NMERGE; this counts input files that are left over
3944 after all full-sized merges have been done. */
3945 size_t remainder;
3947 /* Number of easily-available slots at the next loop iteration. */
3948 size_t cheap_slots;
3950 /* Do as many NMERGE-size merges as possible. In the case that
3951 nmerge is bogus, increment by the maximum number of file
3952 descriptors allowed. */
3953 for (out = in = 0; nmerge <= nfiles - in; out++)
3955 FILE *tfp;
3956 struct tempnode *temp = create_temp (&tfp);
3957 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3958 nmerge, tfp, temp->name);
3959 ntemps -= MIN (ntemps, num_merged);
3960 files[out].name = temp->name;
3961 files[out].temp = temp;
3962 in += num_merged;
3965 remainder = nfiles - in;
3966 cheap_slots = nmerge - out % nmerge;
3968 if (cheap_slots < remainder)
3970 /* So many files remain that they can't all be put into the last
3971 NMERGE-sized output window. Do one more merge. Merge as few
3972 files as possible, to avoid needless I/O. */
3973 size_t nshortmerge = remainder - cheap_slots + 1;
3974 FILE *tfp;
3975 struct tempnode *temp = create_temp (&tfp);
3976 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3977 nshortmerge, tfp, temp->name);
3978 ntemps -= MIN (ntemps, num_merged);
3979 files[out].name = temp->name;
3980 files[out++].temp = temp;
3981 in += num_merged;
3984 /* Put the remaining input files into the last NMERGE-sized output
3985 window, so they will be merged in the next pass. */
3986 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3987 ntemps += out;
3988 nfiles -= in - out;
3991 avoid_trashing_input (files, ntemps, nfiles, output_file);
3993 /* We aren't guaranteed that this final mergefiles will work, therefore we
3994 try to merge into the output, and then merge as much as we can into a
3995 temp file if we can't. Repeat. */
3997 while (true)
3999 /* Merge directly into the output file if possible. */
4000 FILE **fps;
4001 size_t nopened = open_input_files (files, nfiles, &fps);
4003 if (nopened == nfiles)
4005 FILE *ofp = stream_open (output_file, "w");
4006 if (ofp)
4008 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
4009 break;
4011 if (errno != EMFILE || nopened <= 2)
4012 sort_die (_("open failed"), output_file);
4014 else if (nopened <= 2)
4015 sort_die (_("open failed"), files[nopened].name);
4017 /* We ran out of file descriptors. Close one of the input
4018 files, to gain a file descriptor. Then create a temporary
4019 file with our spare file descriptor. Retry if that failed
4020 (e.g., some other process could open a file between the time
4021 we closed and tried to create). */
4022 FILE *tfp;
4023 struct tempnode *temp;
4026 nopened--;
4027 xfclose (fps[nopened], files[nopened].name);
4028 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
4030 while (!temp);
4032 /* Merge into the newly allocated temporary. */
4033 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
4034 fps);
4035 ntemps -= MIN (ntemps, nopened);
4036 files[0].name = temp->name;
4037 files[0].temp = temp;
4039 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
4040 ntemps++;
4041 nfiles -= nopened - 1;
4045 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
4047 static void
4048 sort (char *const *files, size_t nfiles, char const *output_file,
4049 size_t nthreads)
4051 struct buffer buf;
4052 size_t ntemps = 0;
4053 bool output_file_created = false;
4055 buf.alloc = 0;
4057 while (nfiles)
4059 char const *temp_output;
4060 char const *file = *files;
4061 FILE *fp = xfopen (file, "r");
4062 FILE *tfp;
4064 size_t bytes_per_line;
4065 if (nthreads > 1)
4067 /* Get log P. */
4068 size_t tmp = 1;
4069 size_t mult = 1;
4070 while (tmp < nthreads)
4072 tmp *= 2;
4073 mult++;
4075 bytes_per_line = (mult * sizeof (struct line));
4077 else
4078 bytes_per_line = sizeof (struct line) * 3 / 2;
4080 if (! buf.alloc)
4081 initbuf (&buf, bytes_per_line,
4082 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
4083 buf.eof = false;
4084 files++;
4085 nfiles--;
4087 while (fillbuf (&buf, fp, file))
4089 struct line *line;
4091 if (buf.eof && nfiles
4092 && (bytes_per_line + 1
4093 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
4095 /* End of file, but there is more input and buffer room.
4096 Concatenate the next input file; this is faster in
4097 the usual case. */
4098 buf.left = buf.used;
4099 break;
4102 saved_line.text = nullptr;
4103 line = buffer_linelim (&buf);
4104 if (buf.eof && !nfiles && !ntemps && !buf.left)
4106 xfclose (fp, file);
4107 tfp = xfopen (output_file, "w");
4108 temp_output = output_file;
4109 output_file_created = true;
4111 else
4113 ++ntemps;
4114 temp_output = create_temp (&tfp)->name;
4116 if (1 < buf.nlines)
4118 struct merge_node_queue queue;
4119 queue_init (&queue, nthreads);
4120 struct merge_node *merge_tree =
4121 merge_tree_init (nthreads, buf.nlines, line);
4123 sortlines (line, nthreads, buf.nlines, merge_tree + 1,
4124 &queue, tfp, temp_output);
4126 merge_tree_destroy (nthreads, merge_tree);
4127 queue_destroy (&queue);
4129 else
4130 write_unique (line - 1, tfp, temp_output);
4132 xfclose (tfp, temp_output);
4134 if (output_file_created)
4135 goto finish;
4137 xfclose (fp, file);
4140 finish:
4141 free (buf.buf);
4143 if (! output_file_created)
4145 struct tempnode *node = temphead;
4146 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
4147 for (size_t i = 0; node; i++)
4149 tempfiles[i].name = node->name;
4150 tempfiles[i].temp = node;
4151 node = node->next;
4153 merge (tempfiles, ntemps, ntemps, output_file);
4154 free (tempfiles);
4157 reap_all ();
4160 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
4162 static void
4163 insertkey (struct keyfield *key_arg)
4165 struct keyfield **p;
4166 struct keyfield *key = xmemdup (key_arg, sizeof *key);
4168 for (p = &keylist; *p; p = &(*p)->next)
4169 continue;
4170 *p = key;
4171 key->next = nullptr;
4174 /* Report a bad field specification SPEC, with extra info MSGID. */
4176 static void
4177 badfieldspec (char const *spec, char const *msgid)
4179 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
4180 _(msgid), quote (spec));
4183 /* Report incompatible options. */
4185 static void
4186 incompatible_options (char const *opts)
4188 error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), (opts));
4191 /* Check compatibility of ordering options. */
4193 static void
4194 check_ordering_compatibility (void)
4196 struct keyfield *key;
4198 for (key = keylist; key; key = key->next)
4199 if (1 < (key->numeric + key->general_numeric + key->human_numeric
4200 + key->month + (key->version | key->random | !!key->ignore)))
4202 /* The following is too big, but guaranteed to be "big enough". */
4203 char opts[sizeof short_options];
4204 /* Clear flags we're not interested in. */
4205 key->skipsblanks = key->skipeblanks = key->reverse = false;
4206 key_to_opts (key, opts);
4207 incompatible_options (opts);
4211 /* Parse the leading integer in STRING and store the resulting value
4212 (which must fit into size_t) into *VAL. Return the address of the
4213 suffix after the integer. If the value is too large, silently
4214 substitute SIZE_MAX. If MSGID is null, return nullptr after
4215 failure; otherwise, report MSGID and exit on failure. */
4217 static char const *
4218 parse_field_count (char const *string, size_t *val, char const *msgid)
4220 char *suffix;
4221 uintmax_t n;
4223 switch (xstrtoumax (string, &suffix, 10, &n, ""))
4225 case LONGINT_OK:
4226 case LONGINT_INVALID_SUFFIX_CHAR:
4227 *val = n;
4228 if (*val == n)
4229 break;
4230 FALLTHROUGH;
4231 case LONGINT_OVERFLOW:
4232 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
4233 *val = SIZE_MAX;
4234 break;
4236 case LONGINT_INVALID:
4237 if (msgid)
4238 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
4239 _(msgid), quote (string));
4240 return nullptr;
4243 return suffix;
4246 /* Handle interrupts and hangups. */
4248 static void
4249 sighandler (int sig)
4251 if (! SA_NOCLDSTOP)
4252 signal (sig, SIG_IGN);
4254 cleanup ();
4256 signal (sig, SIG_DFL);
4257 raise (sig);
4260 /* Set the ordering options for KEY specified in S.
4261 Return the address of the first character in S that
4262 is not a valid ordering option.
4263 BLANKTYPE is the kind of blanks that 'b' should skip. */
4265 static char *
4266 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4268 while (*s)
4270 switch (*s)
4272 case 'b':
4273 if (blanktype == bl_start || blanktype == bl_both)
4274 key->skipsblanks = true;
4275 if (blanktype == bl_end || blanktype == bl_both)
4276 key->skipeblanks = true;
4277 break;
4278 case 'd':
4279 key->ignore = nondictionary;
4280 break;
4281 case 'f':
4282 key->translate = fold_toupper;
4283 break;
4284 case 'g':
4285 key->general_numeric = true;
4286 break;
4287 case 'h':
4288 key->human_numeric = true;
4289 break;
4290 case 'i':
4291 /* Option order should not matter, so don't let -i override
4292 -d. -d implies -i, but -i does not imply -d. */
4293 if (! key->ignore)
4294 key->ignore = nonprinting;
4295 break;
4296 case 'M':
4297 key->month = true;
4298 break;
4299 case 'n':
4300 key->numeric = true;
4301 break;
4302 case 'R':
4303 key->random = true;
4304 break;
4305 case 'r':
4306 key->reverse = true;
4307 break;
4308 case 'V':
4309 key->version = true;
4310 break;
4311 default:
4312 return (char *) s;
4314 ++s;
4316 return (char *) s;
4319 /* Initialize KEY. */
4321 static struct keyfield *
4322 key_init (struct keyfield *key)
4324 memset (key, 0, sizeof *key);
4325 key->eword = SIZE_MAX;
4326 return key;
4330 main (int argc, char **argv)
4332 struct keyfield *key;
4333 struct keyfield key_buf;
4334 struct keyfield gkey;
4335 bool gkey_only = false;
4336 char const *s;
4337 int c = 0;
4338 char checkonly = 0;
4339 bool mergeonly = false;
4340 char *random_source = nullptr;
4341 bool need_random = false;
4342 size_t nthreads = 0;
4343 size_t nfiles = 0;
4344 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
4345 int posix_ver = posix2_version ();
4346 bool traditional_usage = ! (200112 <= posix_ver && posix_ver < 200809);
4347 char **files;
4348 char *files_from = nullptr;
4349 struct Tokens tok;
4350 char const *outfile = nullptr;
4351 bool locale_ok;
4353 initialize_main (&argc, &argv);
4354 set_program_name (argv[0]);
4355 locale_ok = !! setlocale (LC_ALL, "");
4356 bindtextdomain (PACKAGE, LOCALEDIR);
4357 textdomain (PACKAGE);
4359 initialize_exit_failure (SORT_FAILURE);
4361 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4362 #if HAVE_NL_LANGINFO
4363 hard_LC_TIME = hard_locale (LC_TIME);
4364 #endif
4366 /* Get locale's representation of the decimal point. */
4368 struct lconv const *locale = localeconv ();
4370 /* If the locale doesn't define a decimal point, or if the decimal
4371 point is multibyte, use the C locale's decimal point. FIXME:
4372 add support for multibyte decimal points. */
4373 decimal_point = locale->decimal_point[0];
4374 if (! decimal_point || locale->decimal_point[1])
4375 decimal_point = '.';
4377 /* FIXME: add support for multibyte thousands separators. */
4378 thousands_sep = locale->thousands_sep[0];
4379 if (thousands_sep && locale->thousands_sep[1])
4380 thousands_sep_ignored = true;
4381 if (! thousands_sep || locale->thousands_sep[1])
4382 thousands_sep = NON_CHAR;
4385 have_read_stdin = false;
4386 inittables ();
4389 size_t i;
4390 static int const sig[] =
4392 /* The usual suspects. */
4393 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4394 #ifdef SIGPOLL
4395 SIGPOLL,
4396 #endif
4397 #ifdef SIGPROF
4398 SIGPROF,
4399 #endif
4400 #ifdef SIGVTALRM
4401 SIGVTALRM,
4402 #endif
4403 #ifdef SIGXCPU
4404 SIGXCPU,
4405 #endif
4406 #ifdef SIGXFSZ
4407 SIGXFSZ,
4408 #endif
4410 enum { nsigs = ARRAY_CARDINALITY (sig) };
4412 #if SA_NOCLDSTOP
4413 struct sigaction act;
4415 sigemptyset (&caught_signals);
4416 for (i = 0; i < nsigs; i++)
4418 sigaction (sig[i], nullptr, &act);
4419 if (act.sa_handler != SIG_IGN)
4420 sigaddset (&caught_signals, sig[i]);
4423 act.sa_handler = sighandler;
4424 act.sa_mask = caught_signals;
4425 act.sa_flags = 0;
4427 for (i = 0; i < nsigs; i++)
4428 if (sigismember (&caught_signals, sig[i]))
4429 sigaction (sig[i], &act, nullptr);
4430 #else
4431 for (i = 0; i < nsigs; i++)
4432 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4434 signal (sig[i], sighandler);
4435 siginterrupt (sig[i], 1);
4437 #endif
4439 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4441 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4442 atexit (exit_cleanup);
4444 key_init (&gkey);
4445 gkey.sword = SIZE_MAX;
4447 files = xnmalloc (argc, sizeof *files);
4449 while (true)
4451 /* Parse an operand as a file after "--" was seen; or if
4452 pedantic and a file was seen, unless the POSIX version
4453 is not 1003.1-2001 and -c was not seen and the operand is
4454 "-o FILE" or "-oFILE". */
4455 int oi = -1;
4457 if (c == -1
4458 || (posixly_correct && nfiles != 0
4459 && ! (traditional_usage
4460 && ! checkonly
4461 && optind != argc
4462 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4463 && (argv[optind][2] || optind + 1 != argc)))
4464 || ((c = getopt_long (argc, argv, short_options,
4465 long_options, &oi))
4466 == -1))
4468 if (argc <= optind)
4469 break;
4470 files[nfiles++] = argv[optind++];
4472 else switch (c)
4474 case 1:
4475 key = nullptr;
4476 if (optarg[0] == '+')
4478 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4479 && ISDIGIT (argv[optind][1]));
4480 traditional_usage |= minus_pos_usage && !posixly_correct;
4481 if (traditional_usage)
4483 /* Treat +POS1 [-POS2] as a key if possible; but silently
4484 treat an operand as a file if it is not a valid +POS1. */
4485 key = key_init (&key_buf);
4486 s = parse_field_count (optarg + 1, &key->sword, nullptr);
4487 if (s && *s == '.')
4488 s = parse_field_count (s + 1, &key->schar, nullptr);
4489 if (! (key->sword || key->schar))
4490 key->sword = SIZE_MAX;
4491 if (! s || *set_ordering (s, key, bl_start))
4492 key = nullptr;
4493 else
4495 if (minus_pos_usage)
4497 char const *optarg1 = argv[optind++];
4498 s = parse_field_count (optarg1 + 1, &key->eword,
4499 N_("invalid number after '-'"));
4500 if (*s == '.')
4501 s = parse_field_count (s + 1, &key->echar,
4502 N_("invalid number after '.'"));
4503 if (!key->echar && key->eword)
4505 /* obsolescent syntax +A.x -B.y is equivalent to:
4506 -k A+1.x+1,B.y (when y = 0)
4507 -k A+1.x+1,B+1.y (when y > 0)
4508 So eword is decremented as in the -k case
4509 only when the end field (B) is specified and
4510 echar (y) is 0. */
4511 key->eword--;
4513 if (*set_ordering (s, key, bl_end))
4514 badfieldspec (optarg1,
4515 N_("stray character in field spec"));
4517 key->traditional_used = true;
4518 insertkey (key);
4522 if (! key)
4523 files[nfiles++] = optarg;
4524 break;
4526 case SORT_OPTION:
4527 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4528 FALLTHROUGH;
4529 case 'b':
4530 case 'd':
4531 case 'f':
4532 case 'g':
4533 case 'h':
4534 case 'i':
4535 case 'M':
4536 case 'n':
4537 case 'r':
4538 case 'R':
4539 case 'V':
4541 char str[2];
4542 str[0] = c;
4543 str[1] = '\0';
4544 set_ordering (str, &gkey, bl_both);
4546 break;
4548 case CHECK_OPTION:
4549 c = (optarg
4550 ? XARGMATCH ("--check", optarg, check_args, check_types)
4551 : 'c');
4552 FALLTHROUGH;
4553 case 'c':
4554 case 'C':
4555 if (checkonly && checkonly != c)
4556 incompatible_options ("cC");
4557 checkonly = c;
4558 break;
4560 case COMPRESS_PROGRAM_OPTION:
4561 if (compress_program && !STREQ (compress_program, optarg))
4562 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4563 compress_program = optarg;
4564 break;
4566 case DEBUG_PROGRAM_OPTION:
4567 debug = true;
4568 break;
4570 case FILES0_FROM_OPTION:
4571 files_from = optarg;
4572 break;
4574 case 'k':
4575 key = key_init (&key_buf);
4577 /* Get POS1. */
4578 s = parse_field_count (optarg, &key->sword,
4579 N_("invalid number at field start"));
4580 if (! key->sword--)
4582 /* Provoke with 'sort -k0' */
4583 badfieldspec (optarg, N_("field number is zero"));
4585 if (*s == '.')
4587 s = parse_field_count (s + 1, &key->schar,
4588 N_("invalid number after '.'"));
4589 if (! key->schar--)
4591 /* Provoke with 'sort -k1.0' */
4592 badfieldspec (optarg, N_("character offset is zero"));
4595 if (! (key->sword || key->schar))
4596 key->sword = SIZE_MAX;
4597 s = set_ordering (s, key, bl_start);
4598 if (*s != ',')
4600 key->eword = SIZE_MAX;
4601 key->echar = 0;
4603 else
4605 /* Get POS2. */
4606 s = parse_field_count (s + 1, &key->eword,
4607 N_("invalid number after ','"));
4608 if (! key->eword--)
4610 /* Provoke with 'sort -k1,0' */
4611 badfieldspec (optarg, N_("field number is zero"));
4613 if (*s == '.')
4615 s = parse_field_count (s + 1, &key->echar,
4616 N_("invalid number after '.'"));
4618 s = set_ordering (s, key, bl_end);
4620 if (*s)
4621 badfieldspec (optarg, N_("stray character in field spec"));
4622 insertkey (key);
4623 break;
4625 case 'm':
4626 mergeonly = true;
4627 break;
4629 case NMERGE_OPTION:
4630 specify_nmerge (oi, c, optarg);
4631 break;
4633 case 'o':
4634 if (outfile && !STREQ (outfile, optarg))
4635 error (SORT_FAILURE, 0, _("multiple output files specified"));
4636 outfile = optarg;
4637 break;
4639 case RANDOM_SOURCE_OPTION:
4640 if (random_source && !STREQ (random_source, optarg))
4641 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4642 random_source = optarg;
4643 break;
4645 case 's':
4646 stable = true;
4647 break;
4649 case 'S':
4650 specify_sort_size (oi, c, optarg);
4651 break;
4653 case 't':
4655 char newtab = optarg[0];
4656 if (! newtab)
4657 error (SORT_FAILURE, 0, _("empty tab"));
4658 if (optarg[1])
4660 if (STREQ (optarg, "\\0"))
4661 newtab = '\0';
4662 else
4664 /* Provoke with 'sort -txx'. Complain about
4665 "multi-character tab" instead of "multibyte tab", so
4666 that the diagnostic's wording does not need to be
4667 changed once multibyte characters are supported. */
4668 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4669 quote (optarg));
4672 if (tab != TAB_DEFAULT && tab != newtab)
4673 error (SORT_FAILURE, 0, _("incompatible tabs"));
4674 tab = newtab;
4676 break;
4678 case 'T':
4679 add_temp_dir (optarg);
4680 break;
4682 case PARALLEL_OPTION:
4683 nthreads = specify_nthreads (oi, c, optarg);
4684 break;
4686 case 'u':
4687 unique = true;
4688 break;
4690 case 'y':
4691 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4692 through Solaris 7. It is also accepted by many non-Solaris
4693 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4694 -y is marked as obsolete starting with Solaris 8 (1999), but is
4695 still accepted as of Solaris 10 prerelease (2004).
4697 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4698 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4699 and which in general ignores the argument after "-y" if it
4700 consists entirely of digits (it can even be empty). */
4701 if (optarg == argv[optind - 1])
4703 char const *p;
4704 for (p = optarg; ISDIGIT (*p); p++)
4705 continue;
4706 optind -= (*p != '\0');
4708 break;
4710 case 'z':
4711 eolchar = 0;
4712 break;
4714 case_GETOPT_HELP_CHAR;
4716 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4718 default:
4719 usage (SORT_FAILURE);
4723 if (files_from)
4725 /* When using --files0-from=F, you may not specify any files
4726 on the command-line. */
4727 if (nfiles)
4729 error (0, 0, _("extra operand %s"), quoteaf (files[0]));
4730 fprintf (stderr, "%s\n",
4731 _("file operands cannot be combined with --files0-from"));
4732 usage (SORT_FAILURE);
4735 FILE *stream = xfopen (files_from, "r");
4737 readtokens0_init (&tok);
4739 if (! readtokens0 (stream, &tok))
4740 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4741 quoteaf (files_from));
4742 xfclose (stream, files_from);
4744 if (tok.n_tok)
4746 free (files);
4747 files = tok.tok;
4748 nfiles = tok.n_tok;
4749 for (size_t i = 0; i < nfiles; i++)
4751 if (STREQ (files[i], "-"))
4752 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4753 "no file name of %s allowed"),
4754 quoteaf (files[i]));
4755 else if (files[i][0] == '\0')
4757 /* Using the standard 'filename:line-number:' prefix here is
4758 not totally appropriate, since NUL is the separator,
4759 not NL, but it might be better than nothing. */
4760 unsigned long int file_number = i + 1;
4761 error (SORT_FAILURE, 0,
4762 _("%s:%lu: invalid zero-length file name"),
4763 quotef (files_from), file_number);
4767 else
4768 error (SORT_FAILURE, 0, _("no input from %s"),
4769 quoteaf (files_from));
4772 /* Inheritance of global options to individual keys. */
4773 for (key = keylist; key; key = key->next)
4775 if (default_key_compare (key) && !key->reverse)
4777 key->ignore = gkey.ignore;
4778 key->translate = gkey.translate;
4779 key->skipsblanks = gkey.skipsblanks;
4780 key->skipeblanks = gkey.skipeblanks;
4781 key->month = gkey.month;
4782 key->numeric = gkey.numeric;
4783 key->general_numeric = gkey.general_numeric;
4784 key->human_numeric = gkey.human_numeric;
4785 key->version = gkey.version;
4786 key->random = gkey.random;
4787 key->reverse = gkey.reverse;
4790 need_random |= key->random;
4793 if (!keylist && !default_key_compare (&gkey))
4795 gkey_only = true;
4796 insertkey (&gkey);
4797 need_random |= gkey.random;
4800 check_ordering_compatibility ();
4802 if (debug)
4804 if (checkonly || outfile)
4806 static char opts[] = "X --debug";
4807 opts[0] = (checkonly ? checkonly : 'o');
4808 incompatible_options (opts);
4811 /* Always output the locale in debug mode, since this
4812 is such a common source of confusion. */
4814 /* OpenBSD can only set some categories with LC_ALL above,
4815 so set LC_COLLATE explicitly to flag errors. */
4816 if (locale_ok)
4817 locale_ok = !! setlocale (LC_COLLATE, "");
4818 if (! locale_ok)
4819 error (0, 0, "%s", _("failed to set locale"));
4820 if (hard_LC_COLLATE)
4821 error (0, 0, _("text ordering performed using %s sorting rules"),
4822 quote (setlocale (LC_COLLATE, nullptr)));
4823 else
4824 error (0, 0, "%s",
4825 _("text ordering performed using simple byte comparison"));
4827 key_warnings (&gkey, gkey_only);
4830 reverse = gkey.reverse;
4832 if (need_random)
4833 random_md5_state_init (random_source);
4835 if (temp_dir_count == 0)
4837 char const *tmp_dir = getenv ("TMPDIR");
4838 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4841 if (nfiles == 0)
4843 nfiles = 1;
4844 free (files);
4845 files = xmalloc (sizeof *files);
4846 *files = (char *) "-";
4849 /* Need to re-check that we meet the minimum requirement for memory
4850 usage with the final value for NMERGE. */
4851 if (0 < sort_size)
4852 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4854 if (checkonly)
4856 if (nfiles > 1)
4857 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4858 quoteaf (files[1]), checkonly);
4860 if (outfile)
4862 static char opts[] = {0, 'o', 0};
4863 opts[0] = checkonly;
4864 incompatible_options (opts);
4867 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4868 input is not properly sorted. */
4869 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4872 /* Check all inputs are accessible, or exit immediately. */
4873 check_inputs (files, nfiles);
4875 /* Check output is writable, or exit immediately. */
4876 check_output (outfile);
4878 if (mergeonly)
4880 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4882 for (size_t i = 0; i < nfiles; ++i)
4883 sortfiles[i].name = files[i];
4885 merge (sortfiles, 0, nfiles, outfile);
4887 else
4889 if (!nthreads)
4891 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4892 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4895 /* Avoid integer overflow later. */
4896 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4897 nthreads = MIN (nthreads, nthreads_max);
4899 sort (files, nfiles, outfile, nthreads);
4902 if (have_read_stdin && fclose (stdin) == EOF)
4903 sort_die (_("close failed"), "-");
4905 main_exit (EXIT_SUCCESS);