maint: add doc/coverage to .gitignore
[coreutils.git] / src / sort.c
blob4b6b61fcdfbac8b73d1eca90b11dafbe5cc9caea
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <pthread.h>
27 #include <sys/resource.h>
28 #include <sys/types.h>
29 #include <sys/wait.h>
30 #include <signal.h>
31 #include <assert.h>
32 #include "system.h"
33 #include "argmatch.h"
34 #include "die.h"
35 #include "error.h"
36 #include "fadvise.h"
37 #include "filevercmp.h"
38 #include "flexmember.h"
39 #include "hard-locale.h"
40 #include "hash.h"
41 #include "heap.h"
42 #include "ignore-value.h"
43 #include "md5.h"
44 #include "mbswidth.h"
45 #include "nproc.h"
46 #include "physmem.h"
47 #include "posixver.h"
48 #include "quote.h"
49 #include "randread.h"
50 #include "readtokens0.h"
51 #include "stdlib--.h"
52 #include "strnumcmp.h"
53 #include "xmemcoll.h"
54 #include "xnanosleep.h"
55 #include "xstrtol.h"
57 #ifndef RLIMIT_DATA
58 struct rlimit { size_t rlim_cur; };
59 # define getrlimit(Resource, Rlp) (-1)
60 #endif
62 /* The official name of this program (e.g., no 'g' prefix). */
63 #define PROGRAM_NAME "sort"
65 #define AUTHORS \
66 proper_name ("Mike Haertel"), \
67 proper_name ("Paul Eggert")
69 #if HAVE_LANGINFO_CODESET
70 # include <langinfo.h>
71 #endif
73 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
74 present. */
75 #ifndef SA_NOCLDSTOP
76 # define SA_NOCLDSTOP 0
77 /* No sigprocmask. Always 'return' zero. */
78 # define sigprocmask(How, Set, Oset) (0)
79 # define sigset_t int
80 # if ! HAVE_SIGINTERRUPT
81 # define siginterrupt(sig, flag) /* empty */
82 # endif
83 #endif
85 #if GNULIB_defined_pthread_functions
86 # undef pthread_sigmask
87 # define pthread_sigmask(how, set, oset) sigprocmask (how, set, oset)
88 #endif
90 #if !defined OPEN_MAX && defined NR_OPEN
91 # define OPEN_MAX NR_OPEN
92 #endif
93 #if !defined OPEN_MAX
94 # define OPEN_MAX 20
95 #endif
97 #define UCHAR_LIM (UCHAR_MAX + 1)
99 #if HAVE_C99_STRTOLD
100 # define long_double long double
101 #else
102 # define long_double double
103 # undef strtold
104 # define strtold strtod
105 #endif
107 #ifndef DEFAULT_TMPDIR
108 # define DEFAULT_TMPDIR "/tmp"
109 #endif
111 /* Maximum number of lines to merge every time a NODE is taken from
112 the merge queue. Node is at LEVEL in the binary merge tree,
113 and is responsible for merging TOTAL lines. */
114 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
116 /* Heuristic value for the number of lines for which it is worth creating
117 a subthread, during an internal merge sort. I.e., it is a small number
118 of "average" lines for which sorting via two threads is faster than
119 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
120 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
121 lines of gensort -a output is sorted slightly faster with --parallel=2
122 than with --parallel=1. By contrast, using --parallel=1 is about 10%
123 faster than using --parallel=2 with a 64K-line input. */
124 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
125 verify (4 <= SUBTHREAD_LINES_HEURISTIC);
127 /* The number of threads after which there are
128 diminishing performance gains. */
129 enum { DEFAULT_MAX_THREADS = 8 };
131 /* Exit statuses. */
132 enum
134 /* POSIX says to exit with status 1 if invoked with -c and the
135 input is not properly sorted. */
136 SORT_OUT_OF_ORDER = 1,
138 /* POSIX says any other irregular exit must exit with a status
139 code greater than 1. */
140 SORT_FAILURE = 2
143 enum
145 /* The number of times we should try to fork a compression process
146 (we retry if the fork call fails). We don't _need_ to compress
147 temp files, this is just to reduce disk access, so this number
148 can be small. Each retry doubles in duration. */
149 MAX_FORK_TRIES_COMPRESS = 4,
151 /* The number of times we should try to fork a decompression process.
152 If we can't fork a decompression process, we can't sort, so this
153 number should be big. Each retry doubles in duration. */
154 MAX_FORK_TRIES_DECOMPRESS = 9
157 enum
159 /* Level of the end-of-merge node, one level above the root. */
160 MERGE_END = 0,
162 /* Level of the root node in merge tree. */
163 MERGE_ROOT = 1
166 /* The representation of the decimal point in the current locale. */
167 static int decimal_point;
169 /* Thousands separator; if -1, then there isn't one. */
170 static int thousands_sep;
172 /* Nonzero if the corresponding locales are hard. */
173 static bool hard_LC_COLLATE;
174 #if HAVE_NL_LANGINFO
175 static bool hard_LC_TIME;
176 #endif
178 #define NONZERO(x) ((x) != 0)
180 /* The kind of blanks for '-b' to skip in various options. */
181 enum blanktype { bl_start, bl_end, bl_both };
183 /* The character marking end of line. Default to \n. */
184 static char eolchar = '\n';
186 /* Lines are held in core as counted strings. */
187 struct line
189 char *text; /* Text of the line. */
190 size_t length; /* Length including final newline. */
191 char *keybeg; /* Start of first key. */
192 char *keylim; /* Limit of first key. */
195 /* Input buffers. */
196 struct buffer
198 char *buf; /* Dynamically allocated buffer,
199 partitioned into 3 regions:
200 - input data;
201 - unused area;
202 - an array of lines, in reverse order. */
203 size_t used; /* Number of bytes used for input data. */
204 size_t nlines; /* Number of lines in the line array. */
205 size_t alloc; /* Number of bytes allocated. */
206 size_t left; /* Number of bytes left from previous reads. */
207 size_t line_bytes; /* Number of bytes to reserve for each line. */
208 bool eof; /* An EOF has been read. */
211 /* Sort key. */
212 struct keyfield
214 size_t sword; /* Zero-origin 'word' to start at. */
215 size_t schar; /* Additional characters to skip. */
216 size_t eword; /* Zero-origin last 'word' of key. */
217 size_t echar; /* Additional characters in field. */
218 bool const *ignore; /* Boolean array of characters to ignore. */
219 char const *translate; /* Translation applied to characters. */
220 bool skipsblanks; /* Skip leading blanks when finding start. */
221 bool skipeblanks; /* Skip leading blanks when finding end. */
222 bool numeric; /* Flag for numeric comparison. Handle
223 strings of digits with optional decimal
224 point, but no exponential notation. */
225 bool random; /* Sort by random hash of key. */
226 bool general_numeric; /* Flag for general, numeric comparison.
227 Handle numbers in exponential notation. */
228 bool human_numeric; /* Flag for sorting by human readable
229 units with either SI or IEC prefixes. */
230 bool month; /* Flag for comparison by month name. */
231 bool reverse; /* Reverse the sense of comparison. */
232 bool version; /* sort by version number */
233 bool traditional_used; /* Traditional key option format is used. */
234 struct keyfield *next; /* Next keyfield to try. */
237 struct month
239 char const *name;
240 int val;
243 /* Binary merge tree node. */
244 struct merge_node
246 struct line *lo; /* Lines to merge from LO child node. */
247 struct line *hi; /* Lines to merge from HI child node. */
248 struct line *end_lo; /* End of available lines from LO. */
249 struct line *end_hi; /* End of available lines from HI. */
250 struct line **dest; /* Pointer to destination of merge. */
251 size_t nlo; /* Total Lines remaining from LO. */
252 size_t nhi; /* Total lines remaining from HI. */
253 struct merge_node *parent; /* Parent node. */
254 struct merge_node *lo_child; /* LO child node. */
255 struct merge_node *hi_child; /* HI child node. */
256 unsigned int level; /* Level in merge tree. */
257 bool queued; /* Node is already in heap. */
258 pthread_mutex_t lock; /* Lock for node operations. */
261 /* Priority queue of merge nodes. */
262 struct merge_node_queue
264 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
265 pthread_mutex_t mutex; /* Lock for queue operations. */
266 pthread_cond_t cond; /* Conditional wait for empty queue to populate
267 when popping. */
270 /* Used to implement --unique (-u). */
271 static struct line saved_line;
273 /* FIXME: None of these tables work with multibyte character sets.
274 Also, there are many other bugs when handling multibyte characters.
275 One way to fix this is to rewrite 'sort' to use wide characters
276 internally, but doing this with good performance is a bit
277 tricky. */
279 /* Table of blanks. */
280 static bool blanks[UCHAR_LIM];
282 /* Table of non-printing characters. */
283 static bool nonprinting[UCHAR_LIM];
285 /* Table of non-dictionary characters (not letters, digits, or blanks). */
286 static bool nondictionary[UCHAR_LIM];
288 /* Translation table folding lower case to upper. */
289 static char fold_toupper[UCHAR_LIM];
291 #define MONTHS_PER_YEAR 12
293 /* Table mapping month names to integers.
294 Alphabetic order allows binary search. */
295 static struct month monthtab[] =
297 {"APR", 4},
298 {"AUG", 8},
299 {"DEC", 12},
300 {"FEB", 2},
301 {"JAN", 1},
302 {"JUL", 7},
303 {"JUN", 6},
304 {"MAR", 3},
305 {"MAY", 5},
306 {"NOV", 11},
307 {"OCT", 10},
308 {"SEP", 9}
311 /* During the merge phase, the number of files to merge at once. */
312 #define NMERGE_DEFAULT 16
314 /* Minimum size for a merge or check buffer. */
315 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
317 /* Minimum sort size; the code might not work with smaller sizes. */
318 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
320 /* The number of bytes needed for a merge or check buffer, which can
321 function relatively efficiently even if it holds only one line. If
322 a longer line is seen, this value is increased. */
323 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
325 /* The approximate maximum number of bytes of main memory to use, as
326 specified by the user. Zero if the user has not specified a size. */
327 static size_t sort_size;
329 /* The initial allocation factor for non-regular files.
330 This is used, e.g., when reading from a pipe.
331 Don't make it too big, since it is multiplied by ~130 to
332 obtain the size of the actual buffer sort will allocate.
333 Also, there may be 8 threads all doing this at the same time. */
334 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
336 /* Array of directory names in which any temporary files are to be created. */
337 static char const **temp_dirs;
339 /* Number of temporary directory names used. */
340 static size_t temp_dir_count;
342 /* Number of allocated slots in temp_dirs. */
343 static size_t temp_dir_alloc;
345 /* Flag to reverse the order of all comparisons. */
346 static bool reverse;
348 /* Flag for stable sort. This turns off the last ditch bytewise
349 comparison of lines, and instead leaves lines in the same order
350 they were read if all keys compare equal. */
351 static bool stable;
353 /* If TAB has this value, blanks separate fields. */
354 enum { TAB_DEFAULT = CHAR_MAX + 1 };
356 /* Tab character separating fields. If TAB_DEFAULT, then fields are
357 separated by the empty string between a non-blank character and a blank
358 character. */
359 static int tab = TAB_DEFAULT;
361 /* Flag to remove consecutive duplicate lines from the output.
362 Only the last of a sequence of equal lines will be output. */
363 static bool unique;
365 /* Nonzero if any of the input files are the standard input. */
366 static bool have_read_stdin;
368 /* List of key field comparisons to be tried. */
369 static struct keyfield *keylist;
371 /* Program used to (de)compress temp files. Must accept -d. */
372 static char const *compress_program;
374 /* Annotate the output with extra info to aid the user. */
375 static bool debug;
377 /* Maximum number of files to merge in one go. If more than this
378 number are present, temp files will be used. */
379 static unsigned int nmerge = NMERGE_DEFAULT;
381 /* Output an error to stderr and exit using async-signal-safe routines.
382 This can be used safely from signal handlers,
383 and between fork and exec of multithreaded processes. */
385 static void async_safe_die (int, const char *) ATTRIBUTE_NORETURN;
386 static void
387 async_safe_die (int errnum, const char *errstr)
389 ignore_value (write (STDERR_FILENO, errstr, strlen (errstr)));
391 /* Even if defined HAVE_STRERROR_R, we can't use it,
392 as it may return a translated string etc. and even if not
393 may call malloc which is unsafe. We might improve this
394 by testing for sys_errlist and using that if available.
395 For now just report the error number. */
396 if (errnum)
398 char errbuf[INT_BUFSIZE_BOUND (errnum)];
399 char *p = inttostr (errnum, errbuf);
400 ignore_value (write (STDERR_FILENO, ": errno ", 8));
401 ignore_value (write (STDERR_FILENO, p, strlen (p)));
404 ignore_value (write (STDERR_FILENO, "\n", 1));
406 _exit (SORT_FAILURE);
409 /* Report MESSAGE for FILE, then clean up and exit.
410 If FILE is null, it represents standard output. */
412 static void sort_die (char const *, char const *) ATTRIBUTE_NORETURN;
413 static void
414 sort_die (char const *message, char const *file)
416 die (SORT_FAILURE, errno, "%s: %s", message,
417 quotef (file ? file : _("standard output")));
420 void
421 usage (int status)
423 if (status != EXIT_SUCCESS)
424 emit_try_help ();
425 else
427 printf (_("\
428 Usage: %s [OPTION]... [FILE]...\n\
429 or: %s [OPTION]... --files0-from=F\n\
431 program_name, program_name);
432 fputs (_("\
433 Write sorted concatenation of all FILE(s) to standard output.\n\
434 "), stdout);
436 emit_stdin_note ();
437 emit_mandatory_arg_note ();
439 fputs (_("\
440 Ordering options:\n\
442 "), stdout);
443 fputs (_("\
444 -b, --ignore-leading-blanks ignore leading blanks\n\
445 -d, --dictionary-order consider only blanks and alphanumeric characters\
447 -f, --ignore-case fold lower case to upper case characters\n\
448 "), stdout);
449 fputs (_("\
450 -g, --general-numeric-sort compare according to general numerical value\n\
451 -i, --ignore-nonprinting consider only printable characters\n\
452 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
453 "), stdout);
454 fputs (_("\
455 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
456 "), stdout);
457 fputs (_("\
458 -n, --numeric-sort compare according to string numerical value\n\
459 -R, --random-sort shuffle, but group identical keys. See shuf(1)\n\
460 --random-source=FILE get random bytes from FILE\n\
461 -r, --reverse reverse the result of comparisons\n\
462 "), stdout);
463 fputs (_("\
464 --sort=WORD sort according to WORD:\n\
465 general-numeric -g, human-numeric -h, month -M,\
467 numeric -n, random -R, version -V\n\
468 -V, --version-sort natural sort of (version) numbers within text\n\
470 "), stdout);
471 fputs (_("\
472 Other options:\n\
474 "), stdout);
475 fputs (_("\
476 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
477 for more use temp files\n\
478 "), stdout);
479 fputs (_("\
480 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
481 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
483 --compress-program=PROG compress temporaries with PROG;\n\
484 decompress them with PROG -d\n\
485 "), stdout);
486 fputs (_("\
487 --debug annotate the part of the line used to sort,\n\
488 and warn about questionable usage to stderr\n\
489 --files0-from=F read input from the files specified by\n\
490 NUL-terminated names in file F;\n\
491 If F is - then read names from standard input\n\
492 "), stdout);
493 fputs (_("\
494 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
495 -m, --merge merge already sorted files; do not sort\n\
496 "), stdout);
497 fputs (_("\
498 -o, --output=FILE write result to FILE instead of standard output\n\
499 -s, --stable stabilize sort by disabling last-resort comparison\
501 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
502 "), stdout);
503 printf (_("\
504 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
505 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
506 multiple options specify multiple directories\n\
507 --parallel=N change the number of sorts run concurrently to N\n\
508 -u, --unique with -c, check for strict ordering;\n\
509 without -c, output only the first of an equal run\
511 "), DEFAULT_TMPDIR);
512 fputs (_("\
513 -z, --zero-terminated line delimiter is NUL, not newline\n\
514 "), stdout);
515 fputs (HELP_OPTION_DESCRIPTION, stdout);
516 fputs (VERSION_OPTION_DESCRIPTION, stdout);
517 fputs (_("\
519 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
520 field number and C a character position in the field; both are origin 1, and\n\
521 the stop position defaults to the line's end. If neither -t nor -b is in\n\
522 effect, characters in a field are counted from the beginning of the preceding\n\
523 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
525 which override global ordering options for that key. If no key is given, use\n\
526 the entire line as the key. Use --debug to diagnose incorrect key usage.\n\
528 SIZE may be followed by the following multiplicative suffixes:\n\
529 "), stdout);
530 fputs (_("\
531 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
533 *** WARNING ***\n\
534 The locale specified by the environment affects sort order.\n\
535 Set LC_ALL=C to get the traditional sort order that uses\n\
536 native byte values.\n\
537 "), stdout );
538 emit_ancillary_info (PROGRAM_NAME);
541 exit (status);
544 /* For long options that have no equivalent short option, use a
545 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
546 enum
548 CHECK_OPTION = CHAR_MAX + 1,
549 COMPRESS_PROGRAM_OPTION,
550 DEBUG_PROGRAM_OPTION,
551 FILES0_FROM_OPTION,
552 NMERGE_OPTION,
553 RANDOM_SOURCE_OPTION,
554 SORT_OPTION,
555 PARALLEL_OPTION
558 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
560 static struct option const long_options[] =
562 {"ignore-leading-blanks", no_argument, NULL, 'b'},
563 {"check", optional_argument, NULL, CHECK_OPTION},
564 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
565 {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
566 {"dictionary-order", no_argument, NULL, 'd'},
567 {"ignore-case", no_argument, NULL, 'f'},
568 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
569 {"general-numeric-sort", no_argument, NULL, 'g'},
570 {"ignore-nonprinting", no_argument, NULL, 'i'},
571 {"key", required_argument, NULL, 'k'},
572 {"merge", no_argument, NULL, 'm'},
573 {"month-sort", no_argument, NULL, 'M'},
574 {"numeric-sort", no_argument, NULL, 'n'},
575 {"human-numeric-sort", no_argument, NULL, 'h'},
576 {"version-sort", no_argument, NULL, 'V'},
577 {"random-sort", no_argument, NULL, 'R'},
578 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
579 {"sort", required_argument, NULL, SORT_OPTION},
580 {"output", required_argument, NULL, 'o'},
581 {"reverse", no_argument, NULL, 'r'},
582 {"stable", no_argument, NULL, 's'},
583 {"batch-size", required_argument, NULL, NMERGE_OPTION},
584 {"buffer-size", required_argument, NULL, 'S'},
585 {"field-separator", required_argument, NULL, 't'},
586 {"temporary-directory", required_argument, NULL, 'T'},
587 {"unique", no_argument, NULL, 'u'},
588 {"zero-terminated", no_argument, NULL, 'z'},
589 {"parallel", required_argument, NULL, PARALLEL_OPTION},
590 {GETOPT_HELP_OPTION_DECL},
591 {GETOPT_VERSION_OPTION_DECL},
592 {NULL, 0, NULL, 0},
595 #define CHECK_TABLE \
596 _ct_("quiet", 'C') \
597 _ct_("silent", 'C') \
598 _ct_("diagnose-first", 'c')
600 static char const *const check_args[] =
602 #define _ct_(_s, _c) _s,
603 CHECK_TABLE NULL
604 #undef _ct_
606 static char const check_types[] =
608 #define _ct_(_s, _c) _c,
609 CHECK_TABLE
610 #undef _ct_
613 #define SORT_TABLE \
614 _st_("general-numeric", 'g') \
615 _st_("human-numeric", 'h') \
616 _st_("month", 'M') \
617 _st_("numeric", 'n') \
618 _st_("random", 'R') \
619 _st_("version", 'V')
621 static char const *const sort_args[] =
623 #define _st_(_s, _c) _s,
624 SORT_TABLE NULL
625 #undef _st_
627 static char const sort_types[] =
629 #define _st_(_s, _c) _c,
630 SORT_TABLE
631 #undef _st_
634 /* The set of signals that are caught. */
635 static sigset_t caught_signals;
637 /* Critical section status. */
638 struct cs_status
640 bool valid;
641 sigset_t sigs;
644 /* Enter a critical section. */
645 static void
646 cs_enter (struct cs_status *status)
648 int ret = pthread_sigmask (SIG_BLOCK, &caught_signals, &status->sigs);
649 status->valid = ret == 0;
652 /* Leave a critical section. */
653 static void
654 cs_leave (struct cs_status const *status)
656 if (status->valid)
658 /* Ignore failure when restoring the signal mask. */
659 pthread_sigmask (SIG_SETMASK, &status->sigs, NULL);
663 /* Possible states for a temp file. If compressed, the file's status
664 is unreaped or reaped, depending on whether 'sort' has waited for
665 the subprocess to finish. */
666 enum { UNCOMPRESSED, UNREAPED, REAPED };
668 /* The list of temporary files. */
669 struct tempnode
671 struct tempnode *volatile next;
672 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
673 char state;
674 char name[FLEXIBLE_ARRAY_MEMBER];
676 static struct tempnode *volatile temphead;
677 static struct tempnode *volatile *temptail = &temphead;
679 /* A file to be sorted. */
680 struct sortfile
682 /* The file's name. */
683 char const *name;
685 /* Non-null if this is a temporary file, in which case NAME == TEMP->name. */
686 struct tempnode *temp;
689 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
690 static Hash_table *proctab;
692 enum { INIT_PROCTAB_SIZE = 47 };
694 static size_t
695 proctab_hasher (void const *entry, size_t tabsize)
697 struct tempnode const *node = entry;
698 return node->pid % tabsize;
701 static bool
702 proctab_comparator (void const *e1, void const *e2)
704 struct tempnode const *n1 = e1;
705 struct tempnode const *n2 = e2;
706 return n1->pid == n2->pid;
709 /* The number of unreaped child processes. */
710 static pid_t nprocs;
712 static bool delete_proc (pid_t);
714 /* If PID is positive, wait for the child process with that PID to
715 exit, and assume that PID has already been removed from the process
716 table. If PID is 0 or -1, clean up some child that has exited (by
717 waiting for it, and removing it from the proc table) and return the
718 child's process ID. However, if PID is 0 and no children have
719 exited, return 0 without waiting. */
721 static pid_t
722 reap (pid_t pid)
724 int status;
725 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
727 if (cpid < 0)
728 die (SORT_FAILURE, errno, _("waiting for %s [-d]"),
729 quoteaf (compress_program));
730 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
732 if (! WIFEXITED (status) || WEXITSTATUS (status))
733 die (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
734 quoteaf (compress_program));
735 --nprocs;
738 return cpid;
741 /* TEMP represents a new process; add it to the process table. Create
742 the process table the first time it's called. */
744 static void
745 register_proc (struct tempnode *temp)
747 if (! proctab)
749 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
750 proctab_hasher,
751 proctab_comparator,
752 NULL);
753 if (! proctab)
754 xalloc_die ();
757 temp->state = UNREAPED;
759 if (! hash_insert (proctab, temp))
760 xalloc_die ();
763 /* If PID is in the process table, remove it and return true.
764 Otherwise, return false. */
766 static bool
767 delete_proc (pid_t pid)
769 struct tempnode test;
771 test.pid = pid;
772 struct tempnode *node = hash_delete (proctab, &test);
773 if (! node)
774 return false;
775 node->state = REAPED;
776 return true;
779 /* Remove PID from the process table, and wait for it to exit if it
780 hasn't already. */
782 static void
783 wait_proc (pid_t pid)
785 if (delete_proc (pid))
786 reap (pid);
789 /* Reap any exited children. Do not block; reap only those that have
790 already exited. */
792 static void
793 reap_exited (void)
795 while (0 < nprocs && reap (0))
796 continue;
799 /* Reap at least one exited child, waiting if necessary. */
801 static void
802 reap_some (void)
804 reap (-1);
805 reap_exited ();
808 /* Reap all children, waiting if necessary. */
810 static void
811 reap_all (void)
813 while (0 < nprocs)
814 reap (-1);
817 /* Clean up any remaining temporary files. */
819 static void
820 cleanup (void)
822 struct tempnode const *node;
824 for (node = temphead; node; node = node->next)
825 unlink (node->name);
826 temphead = NULL;
829 /* Cleanup actions to take when exiting. */
831 static void
832 exit_cleanup (void)
834 if (temphead)
836 /* Clean up any remaining temporary files in a critical section so
837 that a signal handler does not try to clean them too. */
838 struct cs_status cs;
839 cs_enter (&cs);
840 cleanup ();
841 cs_leave (&cs);
844 close_stdout ();
847 /* Create a new temporary file, returning its newly allocated tempnode.
848 Store into *PFD the file descriptor open for writing.
849 If the creation fails, return NULL and store -1 into *PFD if the
850 failure is due to file descriptor exhaustion and
851 SURVIVE_FD_EXHAUSTION; otherwise, die. */
853 static struct tempnode *
854 create_temp_file (int *pfd, bool survive_fd_exhaustion)
856 static char const slashbase[] = "/sortXXXXXX";
857 static size_t temp_dir_index;
858 int fd;
859 int saved_errno;
860 char const *temp_dir = temp_dirs[temp_dir_index];
861 size_t len = strlen (temp_dir);
862 struct tempnode *node =
863 xmalloc (FLEXSIZEOF (struct tempnode, name, len + sizeof slashbase));
864 char *file = node->name;
865 struct cs_status cs;
867 memcpy (file, temp_dir, len);
868 memcpy (file + len, slashbase, sizeof slashbase);
869 node->next = NULL;
870 if (++temp_dir_index == temp_dir_count)
871 temp_dir_index = 0;
873 /* Create the temporary file in a critical section, to avoid races. */
874 cs_enter (&cs);
875 fd = mkostemp (file, O_CLOEXEC);
876 if (0 <= fd)
878 *temptail = node;
879 temptail = &node->next;
881 saved_errno = errno;
882 cs_leave (&cs);
883 errno = saved_errno;
885 if (fd < 0)
887 if (! (survive_fd_exhaustion && errno == EMFILE))
888 die (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
889 quoteaf (temp_dir));
890 free (node);
891 node = NULL;
894 *pfd = fd;
895 return node;
898 /* Return a stream for FILE, opened with mode HOW. A null FILE means
899 standard output; HOW should be "w". When opening for input, "-"
900 means standard input. To avoid confusion, do not return file
901 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
902 opening an ordinary FILE. Return NULL if unsuccessful.
904 Use fadvise to specify an access pattern for input files.
905 There are a few hints we could possibly provide,
906 and after careful testing it was decided that
907 specifying FADVISE_SEQUENTIAL was not detrimental
908 to any cases. On Linux 2.6.31, this option doubles
909 the size of read ahead performed and thus was seen to
910 benefit these cases:
911 Merging
912 Sorting with a smaller internal buffer
913 Reading from faster flash devices
915 In _addition_ one could also specify other hints...
917 FADVISE_WILLNEED was tested, but Linux 2.6.31
918 at least uses that to _synchronously_ prepopulate the cache
919 with the specified range. While sort does need to
920 read all of its input before outputting, a synchronous
921 read of the whole file up front precludes any processing
922 that sort could do in parallel with the system doing
923 read ahead of the data. This was seen to have negative effects
924 in a couple of cases:
925 Merging
926 Sorting with a smaller internal buffer
927 This option was seen to shorten the runtime for sort
928 on a multicore system with lots of RAM and other processes
929 competing for CPU. It could be argued that more explicit
930 scheduling hints with 'nice' et. al. are more appropriate
931 for this situation.
933 FADVISE_NOREUSE is a possibility as it could lower
934 the priority of input data in the cache as sort will
935 only need to process it once. However its functionality
936 has changed over Linux kernel versions and as of 2.6.31
937 it does nothing and thus we can't depend on what it might
938 do in future.
940 FADVISE_DONTNEED is not appropriate for user specified
941 input files, but for temp files we do want to drop the
942 cache immediately after processing. This is done implicitly
943 however when the files are unlinked. */
945 static FILE *
946 stream_open (char const *file, char const *how)
948 FILE *fp;
950 if (*how == 'r')
952 if (STREQ (file, "-"))
954 have_read_stdin = true;
955 fp = stdin;
957 else
959 int fd = open (file, O_RDONLY | O_CLOEXEC);
960 fp = fd < 0 ? NULL : fdopen (fd, how);
962 fadvise (fp, FADVISE_SEQUENTIAL);
964 else if (*how == 'w')
966 if (file && ftruncate (STDOUT_FILENO, 0) != 0)
967 die (SORT_FAILURE, errno, _("%s: error truncating"),
968 quotef (file));
969 fp = stdout;
971 else
972 assert (!"unexpected mode passed to stream_open");
974 return fp;
977 /* Same as stream_open, except always return a non-null value; die on
978 failure. */
980 static FILE *
981 xfopen (char const *file, char const *how)
983 FILE *fp = stream_open (file, how);
984 if (!fp)
985 sort_die (_("open failed"), file);
986 return fp;
989 /* Close FP, whose name is FILE, and report any errors. */
991 static void
992 xfclose (FILE *fp, char const *file)
994 switch (fileno (fp))
996 case STDIN_FILENO:
997 /* Allow reading stdin from tty more than once. */
998 if (feof (fp))
999 clearerr (fp);
1000 break;
1002 case STDOUT_FILENO:
1003 /* Don't close stdout just yet. close_stdout does that. */
1004 if (fflush (fp) != 0)
1005 sort_die (_("fflush failed"), file);
1006 break;
1008 default:
1009 if (fclose (fp) != 0)
1010 sort_die (_("close failed"), file);
1011 break;
1015 /* Move OLDFD to NEWFD. If OLDFD != NEWFD, NEWFD is not close-on-exec. */
1017 static void
1018 move_fd (int oldfd, int newfd)
1020 if (oldfd != newfd)
1022 /* This should never fail for our usage. */
1023 dup2 (oldfd, newfd);
1024 close (oldfd);
1028 /* Fork a child process for piping to and do common cleanup. The
1029 TRIES parameter specifies how many times to try to fork before
1030 giving up. Return the PID of the child, or -1 (setting errno)
1031 on failure. */
1033 static pid_t
1034 pipe_fork (int pipefds[2], size_t tries)
1036 #if HAVE_WORKING_FORK
1037 struct tempnode *saved_temphead;
1038 int saved_errno;
1039 double wait_retry = 0.25;
1040 pid_t pid IF_LINT ( = -1);
1041 struct cs_status cs;
1043 if (pipe2 (pipefds, O_CLOEXEC) < 0)
1044 return -1;
1046 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1047 uncontrolled subprocess generation can hurt performance significantly.
1048 Allow at most NMERGE + 2 subprocesses, on the theory that there
1049 may be some useful parallelism by letting compression for the
1050 previous merge finish (1 subprocess) in parallel with the current
1051 merge (NMERGE + 1 subprocesses). */
1053 if (nmerge + 1 < nprocs)
1054 reap_some ();
1056 while (tries--)
1058 /* This is so the child process won't delete our temp files
1059 if it receives a signal before exec-ing. */
1060 cs_enter (&cs);
1061 saved_temphead = temphead;
1062 temphead = NULL;
1064 pid = fork ();
1065 saved_errno = errno;
1066 if (pid)
1067 temphead = saved_temphead;
1069 cs_leave (&cs);
1070 errno = saved_errno;
1072 if (0 <= pid || errno != EAGAIN)
1073 break;
1074 else
1076 xnanosleep (wait_retry);
1077 wait_retry *= 2;
1078 reap_exited ();
1082 if (pid < 0)
1084 saved_errno = errno;
1085 close (pipefds[0]);
1086 close (pipefds[1]);
1087 errno = saved_errno;
1089 else if (pid == 0)
1091 close (STDIN_FILENO);
1092 close (STDOUT_FILENO);
1094 else
1095 ++nprocs;
1097 return pid;
1099 #else /* ! HAVE_WORKING_FORK */
1100 return -1;
1101 #endif
1104 /* Create a temporary file and, if asked for, start a compressor
1105 to that file. Set *PFP to the file handle and return
1106 the address of the new temp node. If the creation
1107 fails, return NULL if the failure is due to file descriptor
1108 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1110 static struct tempnode *
1111 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1113 int tempfd;
1114 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1115 if (! node)
1116 return NULL;
1118 node->state = UNCOMPRESSED;
1120 if (compress_program)
1122 int pipefds[2];
1124 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1125 if (0 < node->pid)
1127 close (tempfd);
1128 close (pipefds[0]);
1129 tempfd = pipefds[1];
1131 register_proc (node);
1133 else if (node->pid == 0)
1135 /* Being the child of a multithreaded program before exec,
1136 we're restricted to calling async-signal-safe routines here. */
1137 close (pipefds[1]);
1138 move_fd (tempfd, STDOUT_FILENO);
1139 move_fd (pipefds[0], STDIN_FILENO);
1141 execlp (compress_program, compress_program, (char *) NULL);
1143 async_safe_die (errno, "couldn't execute compress program");
1147 *pfp = fdopen (tempfd, "w");
1148 if (! *pfp)
1149 sort_die (_("couldn't create temporary file"), node->name);
1151 return node;
1154 /* Create a temporary file and, if asked for, start a compressor
1155 to that file. Set *PFP to the file handle and return the address
1156 of the new temp node. Die on failure. */
1158 static struct tempnode *
1159 create_temp (FILE **pfp)
1161 return maybe_create_temp (pfp, false);
1164 /* Open a compressed temp file and start a decompression process through
1165 which to filter the input. Return NULL (setting errno to
1166 EMFILE) if we ran out of file descriptors, and die on any other
1167 kind of failure. */
1169 static FILE *
1170 open_temp (struct tempnode *temp)
1172 int tempfd, pipefds[2];
1173 FILE *fp = NULL;
1175 if (temp->state == UNREAPED)
1176 wait_proc (temp->pid);
1178 tempfd = open (temp->name, O_RDONLY);
1179 if (tempfd < 0)
1180 return NULL;
1182 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1184 switch (child)
1186 case -1:
1187 if (errno != EMFILE)
1188 die (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1189 quoteaf (compress_program));
1190 close (tempfd);
1191 errno = EMFILE;
1192 break;
1194 case 0:
1195 /* Being the child of a multithreaded program before exec,
1196 we're restricted to calling async-signal-safe routines here. */
1197 close (pipefds[0]);
1198 move_fd (tempfd, STDIN_FILENO);
1199 move_fd (pipefds[1], STDOUT_FILENO);
1201 execlp (compress_program, compress_program, "-d", (char *) NULL);
1203 async_safe_die (errno, "couldn't execute compress program (with -d)");
1205 default:
1206 temp->pid = child;
1207 register_proc (temp);
1208 close (tempfd);
1209 close (pipefds[1]);
1211 fp = fdopen (pipefds[0], "r");
1212 if (! fp)
1214 int saved_errno = errno;
1215 close (pipefds[0]);
1216 errno = saved_errno;
1218 break;
1221 return fp;
1224 /* Append DIR to the array of temporary directory names. */
1225 static void
1226 add_temp_dir (char const *dir)
1228 if (temp_dir_count == temp_dir_alloc)
1229 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1231 temp_dirs[temp_dir_count++] = dir;
1234 /* Remove NAME from the list of temporary files. */
1236 static void
1237 zaptemp (char const *name)
1239 struct tempnode *volatile *pnode;
1240 struct tempnode *node;
1241 struct tempnode *next;
1242 int unlink_status;
1243 int unlink_errno = 0;
1244 struct cs_status cs;
1246 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1247 continue;
1249 if (node->state == UNREAPED)
1250 wait_proc (node->pid);
1252 /* Unlink the temporary file in a critical section to avoid races. */
1253 next = node->next;
1254 cs_enter (&cs);
1255 unlink_status = unlink (name);
1256 unlink_errno = errno;
1257 *pnode = next;
1258 cs_leave (&cs);
1260 if (unlink_status != 0)
1261 error (0, unlink_errno, _("warning: cannot remove: %s"), quotef (name));
1262 if (! next)
1263 temptail = pnode;
1264 free (node);
1267 #if HAVE_NL_LANGINFO
1269 static int
1270 struct_month_cmp (void const *m1, void const *m2)
1272 struct month const *month1 = m1;
1273 struct month const *month2 = m2;
1274 return strcmp (month1->name, month2->name);
1277 #endif
1279 /* Initialize the character class tables. */
1281 static void
1282 inittables (void)
1284 size_t i;
1286 for (i = 0; i < UCHAR_LIM; ++i)
1288 blanks[i] = field_sep (i);
1289 nonprinting[i] = ! isprint (i);
1290 nondictionary[i] = ! isalnum (i) && ! field_sep (i);
1291 fold_toupper[i] = toupper (i);
1294 #if HAVE_NL_LANGINFO
1295 /* If we're not in the "C" locale, read different names for months. */
1296 if (hard_LC_TIME)
1298 for (i = 0; i < MONTHS_PER_YEAR; i++)
1300 char const *s;
1301 size_t s_len;
1302 size_t j, k;
1303 char *name;
1305 s = nl_langinfo (ABMON_1 + i);
1306 s_len = strlen (s);
1307 monthtab[i].name = name = xmalloc (s_len + 1);
1308 monthtab[i].val = i + 1;
1310 for (j = k = 0; j < s_len; j++)
1311 if (! isblank (to_uchar (s[j])))
1312 name[k++] = fold_toupper[to_uchar (s[j])];
1313 name[k] = '\0';
1315 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1317 #endif
1320 /* Specify how many inputs may be merged at once.
1321 This may be set on the command-line with the
1322 --batch-size option. */
1323 static void
1324 specify_nmerge (int oi, char c, char const *s)
1326 uintmax_t n;
1327 struct rlimit rlimit;
1328 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1330 /* Try to find out how many file descriptors we'll be able
1331 to open. We need at least nmerge + 3 (STDIN_FILENO,
1332 STDOUT_FILENO and STDERR_FILENO). */
1333 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1334 ? rlimit.rlim_cur
1335 : OPEN_MAX)
1336 - 3);
1338 if (e == LONGINT_OK)
1340 nmerge = n;
1341 if (nmerge != n)
1342 e = LONGINT_OVERFLOW;
1343 else
1345 if (nmerge < 2)
1347 error (0, 0, _("invalid --%s argument %s"),
1348 long_options[oi].name, quote (s));
1349 die (SORT_FAILURE, 0,
1350 _("minimum --%s argument is %s"),
1351 long_options[oi].name, quote ("2"));
1353 else if (max_nmerge < nmerge)
1355 e = LONGINT_OVERFLOW;
1357 else
1358 return;
1362 if (e == LONGINT_OVERFLOW)
1364 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1365 error (0, 0, _("--%s argument %s too large"),
1366 long_options[oi].name, quote (s));
1367 die (SORT_FAILURE, 0,
1368 _("maximum --%s argument with current rlimit is %s"),
1369 long_options[oi].name,
1370 uinttostr (max_nmerge, max_nmerge_buf));
1372 else
1373 xstrtol_fatal (e, oi, c, long_options, s);
1376 /* Specify the amount of main memory to use when sorting. */
1377 static void
1378 specify_sort_size (int oi, char c, char const *s)
1380 uintmax_t n;
1381 char *suffix;
1382 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1384 /* The default unit is KiB. */
1385 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1387 if (n <= UINTMAX_MAX / 1024)
1388 n *= 1024;
1389 else
1390 e = LONGINT_OVERFLOW;
1393 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1394 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1395 switch (suffix[0])
1397 case 'b':
1398 e = LONGINT_OK;
1399 break;
1401 case '%':
1403 double mem = physmem_total () * n / 100;
1405 /* Use "<", not "<=", to avoid problems with rounding. */
1406 if (mem < UINTMAX_MAX)
1408 n = mem;
1409 e = LONGINT_OK;
1411 else
1412 e = LONGINT_OVERFLOW;
1414 break;
1417 if (e == LONGINT_OK)
1419 /* If multiple sort sizes are specified, take the maximum, so
1420 that option order does not matter. */
1421 if (n < sort_size)
1422 return;
1424 sort_size = n;
1425 if (sort_size == n)
1427 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1428 return;
1431 e = LONGINT_OVERFLOW;
1434 xstrtol_fatal (e, oi, c, long_options, s);
1437 /* Specify the number of threads to spawn during internal sort. */
1438 static size_t
1439 specify_nthreads (int oi, char c, char const *s)
1441 unsigned long int nthreads;
1442 enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1443 if (e == LONGINT_OVERFLOW)
1444 return SIZE_MAX;
1445 if (e != LONGINT_OK)
1446 xstrtol_fatal (e, oi, c, long_options, s);
1447 if (SIZE_MAX < nthreads)
1448 nthreads = SIZE_MAX;
1449 if (nthreads == 0)
1450 die (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1451 return nthreads;
1454 /* Return the default sort size. */
1455 static size_t
1456 default_sort_size (void)
1458 /* Let SIZE be MEM, but no more than the maximum object size,
1459 total memory, or system resource limits. Don't bother to check
1460 for values like RLIM_INFINITY since in practice they are not much
1461 less than SIZE_MAX. */
1462 size_t size = SIZE_MAX;
1463 struct rlimit rlimit;
1464 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1465 size = rlimit.rlim_cur;
1466 #ifdef RLIMIT_AS
1467 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1468 size = rlimit.rlim_cur;
1469 #endif
1471 /* Leave a large safety margin for the above limits, as failure can
1472 occur when they are exceeded. */
1473 size /= 2;
1475 #ifdef RLIMIT_RSS
1476 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1477 Exceeding RSS is not fatal, but can be quite slow. */
1478 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1479 size = rlimit.rlim_cur / 16 * 15;
1480 #endif
1482 /* Let MEM be available memory or 1/8 of total memory, whichever
1483 is greater. */
1484 double avail = physmem_available ();
1485 double total = physmem_total ();
1486 double mem = MAX (avail, total / 8);
1488 /* Leave a 1/4 margin for physical memory. */
1489 if (total * 0.75 < size)
1490 size = total * 0.75;
1492 /* Return the minimum of MEM and SIZE, but no less than
1493 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1494 right when only one argument is floating point. */
1495 if (mem < size)
1496 size = mem;
1497 return MAX (size, MIN_SORT_SIZE);
1500 /* Return the sort buffer size to use with the input files identified
1501 by FPS and FILES, which are alternate names of the same files.
1502 NFILES gives the number of input files; NFPS may be less. Assume
1503 that each input line requires LINE_BYTES extra bytes' worth of line
1504 information. Do not exceed the size bound specified by the user
1505 (or a default size bound, if the user does not specify one). */
1507 static size_t
1508 sort_buffer_size (FILE *const *fps, size_t nfps,
1509 char *const *files, size_t nfiles,
1510 size_t line_bytes)
1512 /* A bound on the input size. If zero, the bound hasn't been
1513 determined yet. */
1514 static size_t size_bound;
1516 /* In the worst case, each input byte is a newline. */
1517 size_t worst_case_per_input_byte = line_bytes + 1;
1519 /* Keep enough room for one extra input line and an extra byte.
1520 This extra room might be needed when preparing to read EOF. */
1521 size_t size = worst_case_per_input_byte + 1;
1523 for (size_t i = 0; i < nfiles; i++)
1525 struct stat st;
1526 off_t file_size;
1527 size_t worst_case;
1529 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1530 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1531 : stat (files[i], &st))
1532 != 0)
1533 sort_die (_("stat failed"), files[i]);
1535 if (S_ISREG (st.st_mode))
1536 file_size = st.st_size;
1537 else
1539 /* The file has unknown size. If the user specified a sort
1540 buffer size, use that; otherwise, guess the size. */
1541 if (sort_size)
1542 return sort_size;
1543 file_size = INPUT_FILE_SIZE_GUESS;
1546 if (! size_bound)
1548 size_bound = sort_size;
1549 if (! size_bound)
1550 size_bound = default_sort_size ();
1553 /* Add the amount of memory needed to represent the worst case
1554 where the input consists entirely of newlines followed by a
1555 single non-newline. Check for overflow. */
1556 worst_case = file_size * worst_case_per_input_byte + 1;
1557 if (file_size != worst_case / worst_case_per_input_byte
1558 || size_bound - size <= worst_case)
1559 return size_bound;
1560 size += worst_case;
1563 return size;
1566 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1567 must be at least sizeof (struct line). Allocate ALLOC bytes
1568 initially. */
1570 static void
1571 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1573 /* Ensure that the line array is properly aligned. If the desired
1574 size cannot be allocated, repeatedly halve it until allocation
1575 succeeds. The smaller allocation may hurt overall performance,
1576 but that's better than failing. */
1577 while (true)
1579 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1580 buf->buf = malloc (alloc);
1581 if (buf->buf)
1582 break;
1583 alloc /= 2;
1584 if (alloc <= line_bytes + 1)
1585 xalloc_die ();
1588 buf->line_bytes = line_bytes;
1589 buf->alloc = alloc;
1590 buf->used = buf->left = buf->nlines = 0;
1591 buf->eof = false;
1594 /* Return one past the limit of the line array. */
1596 static inline struct line *
1597 buffer_linelim (struct buffer const *buf)
1599 void *linelim = buf->buf + buf->alloc;
1600 return linelim;
1603 /* Return a pointer to the first character of the field specified
1604 by KEY in LINE. */
1606 static char *
1607 begfield (struct line const *line, struct keyfield const *key)
1609 char *ptr = line->text, *lim = ptr + line->length - 1;
1610 size_t sword = key->sword;
1611 size_t schar = key->schar;
1613 /* The leading field separator itself is included in a field when -t
1614 is absent. */
1616 if (tab != TAB_DEFAULT)
1617 while (ptr < lim && sword--)
1619 while (ptr < lim && *ptr != tab)
1620 ++ptr;
1621 if (ptr < lim)
1622 ++ptr;
1624 else
1625 while (ptr < lim && sword--)
1627 while (ptr < lim && blanks[to_uchar (*ptr)])
1628 ++ptr;
1629 while (ptr < lim && !blanks[to_uchar (*ptr)])
1630 ++ptr;
1633 /* If we're ignoring leading blanks when computing the Start
1634 of the field, skip past them here. */
1635 if (key->skipsblanks)
1636 while (ptr < lim && blanks[to_uchar (*ptr)])
1637 ++ptr;
1639 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1640 ptr = MIN (lim, ptr + schar);
1642 return ptr;
1645 /* Return the limit of (a pointer to the first character after) the field
1646 in LINE specified by KEY. */
1648 static char *
1649 limfield (struct line const *line, struct keyfield const *key)
1651 char *ptr = line->text, *lim = ptr + line->length - 1;
1652 size_t eword = key->eword, echar = key->echar;
1654 if (echar == 0)
1655 eword++; /* Skip all of end field. */
1657 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1658 whichever comes first. If there are more than EWORD fields, leave
1659 PTR pointing at the beginning of the field having zero-based index,
1660 EWORD. If a delimiter character was specified (via -t), then that
1661 'beginning' is the first character following the delimiting TAB.
1662 Otherwise, leave PTR pointing at the first 'blank' character after
1663 the preceding field. */
1664 if (tab != TAB_DEFAULT)
1665 while (ptr < lim && eword--)
1667 while (ptr < lim && *ptr != tab)
1668 ++ptr;
1669 if (ptr < lim && (eword || echar))
1670 ++ptr;
1672 else
1673 while (ptr < lim && eword--)
1675 while (ptr < lim && blanks[to_uchar (*ptr)])
1676 ++ptr;
1677 while (ptr < lim && !blanks[to_uchar (*ptr)])
1678 ++ptr;
1681 #ifdef POSIX_UNSPECIFIED
1682 /* The following block of code makes GNU sort incompatible with
1683 standard Unix sort, so it's ifdef'd out for now.
1684 The POSIX spec isn't clear on how to interpret this.
1685 FIXME: request clarification.
1687 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1688 Date: Thu, 30 May 96 12:20:41 -0400
1689 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1691 [...]I believe I've found another bug in 'sort'.
1693 $ cat /tmp/sort.in
1694 a b c 2 d
1695 pq rs 1 t
1696 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1697 a b c 2 d
1698 pq rs 1 t
1699 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1700 pq rs 1 t
1701 a b c 2 d
1703 Unix sort produced the answer I expected: sort on the single character
1704 in column 7. GNU sort produced different results, because it disagrees
1705 on the interpretation of the key-end spec "M.N". Unix sort reads this
1706 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1707 "skip M-1 fields, then either N-1 characters or the rest of the current
1708 field, whichever comes first". This extra clause applies only to
1709 key-ends, not key-starts.
1712 /* Make LIM point to the end of (one byte past) the current field. */
1713 if (tab != TAB_DEFAULT)
1715 char *newlim;
1716 newlim = memchr (ptr, tab, lim - ptr);
1717 if (newlim)
1718 lim = newlim;
1720 else
1722 char *newlim;
1723 newlim = ptr;
1724 while (newlim < lim && blanks[to_uchar (*newlim)])
1725 ++newlim;
1726 while (newlim < lim && !blanks[to_uchar (*newlim)])
1727 ++newlim;
1728 lim = newlim;
1730 #endif
1732 if (echar != 0) /* We need to skip over a portion of the end field. */
1734 /* If we're ignoring leading blanks when computing the End
1735 of the field, skip past them here. */
1736 if (key->skipeblanks)
1737 while (ptr < lim && blanks[to_uchar (*ptr)])
1738 ++ptr;
1740 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1741 ptr = MIN (lim, ptr + echar);
1744 return ptr;
1747 /* Fill BUF reading from FP, moving buf->left bytes from the end
1748 of buf->buf to the beginning first. If EOF is reached and the
1749 file wasn't terminated by a newline, supply one. Set up BUF's line
1750 table too. FILE is the name of the file corresponding to FP.
1751 Return true if some input was read. */
1753 static bool
1754 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1756 struct keyfield const *key = keylist;
1757 char eol = eolchar;
1758 size_t line_bytes = buf->line_bytes;
1759 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1761 if (buf->eof)
1762 return false;
1764 if (buf->used != buf->left)
1766 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1767 buf->used = buf->left;
1768 buf->nlines = 0;
1771 while (true)
1773 char *ptr = buf->buf + buf->used;
1774 struct line *linelim = buffer_linelim (buf);
1775 struct line *line = linelim - buf->nlines;
1776 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1777 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1779 while (line_bytes + 1 < avail)
1781 /* Read as many bytes as possible, but do not read so many
1782 bytes that there might not be enough room for the
1783 corresponding line array. The worst case is when the
1784 rest of the input file consists entirely of newlines,
1785 except that the last byte is not a newline. */
1786 size_t readsize = (avail - 1) / (line_bytes + 1);
1787 size_t bytes_read = fread (ptr, 1, readsize, fp);
1788 char *ptrlim = ptr + bytes_read;
1789 char *p;
1790 avail -= bytes_read;
1792 if (bytes_read != readsize)
1794 if (ferror (fp))
1795 sort_die (_("read failed"), file);
1796 if (feof (fp))
1798 buf->eof = true;
1799 if (buf->buf == ptrlim)
1800 return false;
1801 if (line_start != ptrlim && ptrlim[-1] != eol)
1802 *ptrlim++ = eol;
1806 /* Find and record each line in the just-read input. */
1807 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1809 /* Delimit the line with NUL. This eliminates the need to
1810 temporarily replace the last byte with NUL when calling
1811 xmemcoll, which increases performance. */
1812 *p = '\0';
1813 ptr = p + 1;
1814 line--;
1815 line->text = line_start;
1816 line->length = ptr - line_start;
1817 mergesize = MAX (mergesize, line->length);
1818 avail -= line_bytes;
1820 if (key)
1822 /* Precompute the position of the first key for
1823 efficiency. */
1824 line->keylim = (key->eword == SIZE_MAX
1826 : limfield (line, key));
1828 if (key->sword != SIZE_MAX)
1829 line->keybeg = begfield (line, key);
1830 else
1832 if (key->skipsblanks)
1833 while (blanks[to_uchar (*line_start)])
1834 line_start++;
1835 line->keybeg = line_start;
1839 line_start = ptr;
1842 ptr = ptrlim;
1843 if (buf->eof)
1844 break;
1847 buf->used = ptr - buf->buf;
1848 buf->nlines = buffer_linelim (buf) - line;
1849 if (buf->nlines != 0)
1851 buf->left = ptr - line_start;
1852 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1853 return true;
1857 /* The current input line is too long to fit in the buffer.
1858 Increase the buffer size and try again, keeping it properly
1859 aligned. */
1860 size_t line_alloc = buf->alloc / sizeof (struct line);
1861 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1862 buf->alloc = line_alloc * sizeof (struct line);
1867 /* Table that maps characters to order-of-magnitude values. */
1868 static char const unit_order[UCHAR_LIM] =
1870 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1871 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1872 /* This initializer syntax works on all C99 hosts. For now, use
1873 it only on non-ASCII hosts, to ease the pain of porting to
1874 pre-C99 ASCII hosts. */
1875 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1876 ['k']=1,
1877 #else
1878 /* Generate the following table with this command:
1879 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1880 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1881 |fmt */
1882 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1883 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1884 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1885 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1886 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1887 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1888 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1889 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1890 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1891 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1892 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1893 #endif
1896 /* Traverse number given as *number consisting of digits, thousands_sep, and
1897 decimal_point chars only. Returns the highest digit found in the number,
1898 or '\0' if no digit has been found. Upon return *number points at the
1899 character that immediately follows after the given number. */
1900 static unsigned char
1901 traverse_raw_number (char const **number)
1903 char const *p = *number;
1904 unsigned char ch;
1905 unsigned char max_digit = '\0';
1906 bool ends_with_thousands_sep = false;
1908 /* Scan to end of number.
1909 Decimals or separators not followed by digits stop the scan.
1910 Numbers ending in decimals or separators are thus considered
1911 to be lacking in units.
1912 FIXME: add support for multibyte thousands_sep and decimal_point. */
1914 while (ISDIGIT (ch = *p++))
1916 if (max_digit < ch)
1917 max_digit = ch;
1919 /* Allow to skip only one occurrence of thousands_sep to avoid finding
1920 the unit in the next column in case thousands_sep matches as blank
1921 and is used as column delimiter. */
1922 ends_with_thousands_sep = (*p == thousands_sep);
1923 if (ends_with_thousands_sep)
1924 ++p;
1927 if (ends_with_thousands_sep)
1929 /* thousands_sep not followed by digit is not allowed. */
1930 *number = p - 2;
1931 return max_digit;
1934 if (ch == decimal_point)
1935 while (ISDIGIT (ch = *p++))
1936 if (max_digit < ch)
1937 max_digit = ch;
1939 *number = p - 1;
1940 return max_digit;
1943 /* Return an integer that represents the order of magnitude of the
1944 unit following the number. The number may contain thousands
1945 separators and a decimal point, but it may not contain leading blanks.
1946 Negative numbers get negative orders; zero numbers have a zero order. */
1948 static int _GL_ATTRIBUTE_PURE
1949 find_unit_order (char const *number)
1951 bool minus_sign = (*number == '-');
1952 char const *p = number + minus_sign;
1953 unsigned char max_digit = traverse_raw_number (&p);
1954 if ('0' < max_digit)
1956 unsigned char ch = *p;
1957 int order = unit_order[ch];
1958 return (minus_sign ? -order : order);
1960 else
1961 return 0;
1964 /* Compare numbers A and B ending in units with SI or IEC prefixes
1965 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1967 static int
1968 human_numcompare (char const *a, char const *b)
1970 while (blanks[to_uchar (*a)])
1971 a++;
1972 while (blanks[to_uchar (*b)])
1973 b++;
1975 int diff = find_unit_order (a) - find_unit_order (b);
1976 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1979 /* Compare strings A and B as numbers without explicitly converting them to
1980 machine numbers. Comparatively slow for short strings, but asymptotically
1981 hideously fast. */
1983 static int
1984 numcompare (char const *a, char const *b)
1986 while (blanks[to_uchar (*a)])
1987 a++;
1988 while (blanks[to_uchar (*b)])
1989 b++;
1991 return strnumcmp (a, b, decimal_point, thousands_sep);
1994 /* Work around a problem whereby the long double value returned by glibc's
1995 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
1996 A and B before calling strtold. FIXME: remove this function once
1997 gnulib guarantees that strtold's result is always well defined. */
1998 static int
1999 nan_compare (char const *sa, char const *sb)
2001 long_double a;
2002 memset (&a, 0, sizeof a);
2003 a = strtold (sa, NULL);
2005 long_double b;
2006 memset (&b, 0, sizeof b);
2007 b = strtold (sb, NULL);
2009 return memcmp (&a, &b, sizeof a);
2012 static int
2013 general_numcompare (char const *sa, char const *sb)
2015 /* FIXME: maybe add option to try expensive FP conversion
2016 only if A and B can't be compared more cheaply/accurately. */
2018 char *ea;
2019 char *eb;
2020 long_double a = strtold (sa, &ea);
2021 long_double b = strtold (sb, &eb);
2023 /* Put conversion errors at the start of the collating sequence. */
2024 if (sa == ea)
2025 return sb == eb ? 0 : -1;
2026 if (sb == eb)
2027 return 1;
2029 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
2030 conversion errors but before numbers; sort them by internal
2031 bit-pattern, for lack of a more portable alternative. */
2032 return (a < b ? -1
2033 : a > b ? 1
2034 : a == b ? 0
2035 : b == b ? -1
2036 : a == a ? 1
2037 : nan_compare (sa, sb));
2040 /* Return an integer in 1..12 of the month name MONTH.
2041 Return 0 if the name in S is not recognized. */
2043 static int
2044 getmonth (char const *month, char **ea)
2046 size_t lo = 0;
2047 size_t hi = MONTHS_PER_YEAR;
2049 while (blanks[to_uchar (*month)])
2050 month++;
2054 size_t ix = (lo + hi) / 2;
2055 char const *m = month;
2056 char const *n = monthtab[ix].name;
2058 for (;; m++, n++)
2060 if (!*n)
2062 if (ea)
2063 *ea = (char *) m;
2064 return monthtab[ix].val;
2066 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
2068 hi = ix;
2069 break;
2071 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
2073 lo = ix + 1;
2074 break;
2078 while (lo < hi);
2080 return 0;
2083 /* A randomly chosen MD5 state, used for random comparison. */
2084 static struct md5_ctx random_md5_state;
2086 /* Initialize the randomly chosen MD5 state. */
2088 static void
2089 random_md5_state_init (char const *random_source)
2091 unsigned char buf[MD5_DIGEST_SIZE];
2092 struct randread_source *r = randread_new (random_source, sizeof buf);
2093 if (! r)
2094 sort_die (_("open failed"), random_source);
2095 randread (r, buf, sizeof buf);
2096 if (randread_free (r) != 0)
2097 sort_die (_("close failed"), random_source);
2098 md5_init_ctx (&random_md5_state);
2099 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2102 /* This is like strxfrm, except it reports any error and exits. */
2104 static size_t
2105 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2107 errno = 0;
2108 size_t translated_size = strxfrm (dest, src, destsize);
2110 if (errno)
2112 error (0, errno, _("string transformation failed"));
2113 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2114 die (SORT_FAILURE, 0,
2115 _("the untransformed string was %s"),
2116 quotearg_n_style (0, locale_quoting_style, src));
2119 return translated_size;
2122 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2123 using one or more random hash functions. TEXTA[LENA] and
2124 TEXTB[LENB] must be zero. */
2126 static int
2127 compare_random (char *restrict texta, size_t lena,
2128 char *restrict textb, size_t lenb)
2130 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2131 data. This is used to break ties if there is a checksum
2132 collision, and this is good enough given the astronomically low
2133 probability of a collision. */
2134 int xfrm_diff = 0;
2136 char stackbuf[4000];
2137 char *buf = stackbuf;
2138 size_t bufsize = sizeof stackbuf;
2139 void *allocated = NULL;
2140 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2141 struct md5_ctx s[2];
2142 s[0] = s[1] = random_md5_state;
2144 if (hard_LC_COLLATE)
2146 char const *lima = texta + lena;
2147 char const *limb = textb + lenb;
2149 while (true)
2151 /* Transform the text into the basis of comparison, so that byte
2152 strings that would otherwise considered to be equal are
2153 considered equal here even if their bytes differ.
2155 Each time through this loop, transform one
2156 null-terminated string's worth from TEXTA or from TEXTB
2157 or both. That way, there's no need to store the
2158 transformation of the whole line, if it contains many
2159 null-terminated strings. */
2161 /* Store the transformed data into a big-enough buffer. */
2163 /* A 3X size guess avoids the overhead of calling strxfrm
2164 twice on typical implementations. Don't worry about
2165 size_t overflow, as the guess need not be correct. */
2166 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2167 if (bufsize < guess_bufsize)
2169 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2170 free (allocated);
2171 buf = allocated = malloc (bufsize);
2172 if (! buf)
2174 buf = stackbuf;
2175 bufsize = sizeof stackbuf;
2179 size_t sizea =
2180 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2181 bool a_fits = sizea <= bufsize;
2182 size_t sizeb =
2183 (textb < limb
2184 ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
2185 (a_fits ? bufsize - sizea : 0))
2186 + 1)
2187 : 0);
2189 if (! (a_fits && sizea + sizeb <= bufsize))
2191 bufsize = sizea + sizeb;
2192 if (bufsize < SIZE_MAX / 3)
2193 bufsize = bufsize * 3 / 2;
2194 free (allocated);
2195 buf = allocated = xmalloc (bufsize);
2196 if (texta < lima)
2197 strxfrm (buf, texta, sizea);
2198 if (textb < limb)
2199 strxfrm (buf + sizea, textb, sizeb);
2202 /* Advance past NULs to the next part of each input string,
2203 exiting the loop if both strings are exhausted. When
2204 exiting the loop, prepare to finish off the tiebreaker
2205 comparison properly. */
2206 if (texta < lima)
2207 texta += strlen (texta) + 1;
2208 if (textb < limb)
2209 textb += strlen (textb) + 1;
2210 if (! (texta < lima || textb < limb))
2212 lena = sizea; texta = buf;
2213 lenb = sizeb; textb = buf + sizea;
2214 break;
2217 /* Accumulate the transformed data in the corresponding
2218 checksums. */
2219 md5_process_bytes (buf, sizea, &s[0]);
2220 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2222 /* Update the tiebreaker comparison of the transformed data. */
2223 if (! xfrm_diff)
2225 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2226 if (! xfrm_diff)
2227 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2232 /* Compute and compare the checksums. */
2233 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2234 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2235 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2237 /* Fall back on the tiebreaker if the checksums collide. */
2238 if (! diff)
2240 if (! xfrm_diff)
2242 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2243 if (! xfrm_diff)
2244 xfrm_diff = (lena > lenb) - (lena < lenb);
2247 diff = xfrm_diff;
2250 free (allocated);
2252 return diff;
2255 /* Return the printable width of the block of memory starting at
2256 TEXT and ending just before LIM, counting each tab as one byte.
2257 FIXME: Should we generally be counting non printable chars? */
2259 static size_t
2260 debug_width (char const *text, char const *lim)
2262 size_t width = mbsnwidth (text, lim - text, 0);
2263 while (text < lim)
2264 width += (*text++ == '\t');
2265 return width;
2268 /* For debug mode, "underline" a key at the
2269 specified offset and screen width. */
2271 static void
2272 mark_key (size_t offset, size_t width)
2274 while (offset--)
2275 putchar (' ');
2277 if (!width)
2278 printf (_("^ no match for key\n"));
2279 else
2282 putchar ('_');
2283 while (--width);
2285 putchar ('\n');
2289 /* Return true if KEY is a numeric key. */
2291 static inline bool
2292 key_numeric (struct keyfield const *key)
2294 return key->numeric || key->general_numeric || key->human_numeric;
2297 /* For LINE, output a debugging line that underlines KEY in LINE.
2298 If KEY is null, underline the whole line. */
2300 static void
2301 debug_key (struct line const *line, struct keyfield const *key)
2303 char *text = line->text;
2304 char *beg = text;
2305 char *lim = text + line->length - 1;
2307 if (key)
2309 if (key->sword != SIZE_MAX)
2310 beg = begfield (line, key);
2311 if (key->eword != SIZE_MAX)
2312 lim = limfield (line, key);
2314 if ((key->skipsblanks && key->sword == SIZE_MAX)
2315 || key->month || key_numeric (key))
2317 char saved = *lim;
2318 *lim = '\0';
2320 while (blanks[to_uchar (*beg)])
2321 beg++;
2323 char *tighter_lim = beg;
2325 if (lim < beg)
2326 tighter_lim = lim;
2327 else if (key->month)
2328 getmonth (beg, &tighter_lim);
2329 else if (key->general_numeric)
2330 ignore_value (strtold (beg, &tighter_lim));
2331 else if (key->numeric || key->human_numeric)
2333 char const *p = beg + (beg < lim && *beg == '-');
2334 unsigned char max_digit = traverse_raw_number (&p);
2335 if ('0' <= max_digit)
2337 unsigned char ch = *p;
2338 tighter_lim = (char *) p
2339 + (key->human_numeric && unit_order[ch]);
2342 else
2343 tighter_lim = lim;
2345 *lim = saved;
2346 lim = tighter_lim;
2350 size_t offset = debug_width (text, beg);
2351 size_t width = debug_width (beg, lim);
2352 mark_key (offset, width);
2355 /* Debug LINE by underlining its keys. */
2357 static void
2358 debug_line (struct line const *line)
2360 struct keyfield const *key = keylist;
2363 debug_key (line, key);
2364 while (key && ((key = key->next) || ! (unique || stable)));
2367 /* Return whether sorting options specified for key. */
2369 static bool
2370 default_key_compare (struct keyfield const *key)
2372 return ! (key->ignore
2373 || key->translate
2374 || key->skipsblanks
2375 || key->skipeblanks
2376 || key_numeric (key)
2377 || key->month
2378 || key->version
2379 || key->random
2380 /* || key->reverse */
2384 /* Convert a key to the short options used to specify it. */
2386 static void
2387 key_to_opts (struct keyfield const *key, char *opts)
2389 if (key->skipsblanks || key->skipeblanks)
2390 *opts++ = 'b';/* either disables global -b */
2391 if (key->ignore == nondictionary)
2392 *opts++ = 'd';
2393 if (key->translate)
2394 *opts++ = 'f';
2395 if (key->general_numeric)
2396 *opts++ = 'g';
2397 if (key->human_numeric)
2398 *opts++ = 'h';
2399 if (key->ignore == nonprinting)
2400 *opts++ = 'i';
2401 if (key->month)
2402 *opts++ = 'M';
2403 if (key->numeric)
2404 *opts++ = 'n';
2405 if (key->random)
2406 *opts++ = 'R';
2407 if (key->reverse)
2408 *opts++ = 'r';
2409 if (key->version)
2410 *opts++ = 'V';
2411 *opts = '\0';
2414 /* Output data independent key warnings to stderr. */
2416 static void
2417 key_warnings (struct keyfield const *gkey, bool gkey_only)
2419 struct keyfield const *key;
2420 struct keyfield ugkey = *gkey;
2421 unsigned long keynum = 1;
2423 for (key = keylist; key; key = key->next, keynum++)
2425 if (key->traditional_used)
2427 size_t sword = key->sword;
2428 size_t eword = key->eword;
2429 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2430 /* obsolescent syntax +A.x -B.y is equivalent to:
2431 -k A+1.x+1,B.y (when y = 0)
2432 -k A+1.x+1,B+1.y (when y > 0) */
2433 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2434 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2435 char *po = obuf;
2436 char *pn = nbuf;
2438 if (sword == SIZE_MAX)
2439 sword++;
2441 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2442 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2443 if (key->eword != SIZE_MAX)
2445 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2446 stpcpy (stpcpy (pn, ","),
2447 umaxtostr (eword + 1
2448 + (key->echar == SIZE_MAX), tmp));
2450 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2451 quote_n (0, obuf), quote_n (1, nbuf));
2454 /* Warn about field specs that will never match. */
2455 bool zero_width = key->sword != SIZE_MAX && key->eword < key->sword;
2456 if (zero_width)
2457 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2459 /* Warn about significant leading blanks. */
2460 bool implicit_skip = key_numeric (key) || key->month;
2461 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2462 if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
2463 && ((!key->skipsblanks && !implicit_skip)
2464 || (!key->skipsblanks && key->schar)
2465 || (!key->skipeblanks && key->echar)))
2466 error (0, 0, _("leading blanks are significant in key %lu; "
2467 "consider also specifying 'b'"), keynum);
2469 /* Warn about numeric comparisons spanning fields,
2470 as field delimiters could be interpreted as part
2471 of the number (maybe only in other locales). */
2472 if (!gkey_only && key_numeric (key))
2474 size_t sword = key->sword + 1;
2475 size_t eword = key->eword + 1;
2476 if (!sword)
2477 sword++;
2478 if (!eword || sword < eword)
2479 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2480 keynum);
2483 /* Flag global options not copied or specified in any key. */
2484 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2485 ugkey.ignore = NULL;
2486 if (ugkey.translate && (ugkey.translate == key->translate))
2487 ugkey.translate = NULL;
2488 ugkey.skipsblanks &= !key->skipsblanks;
2489 ugkey.skipeblanks &= !key->skipeblanks;
2490 ugkey.month &= !key->month;
2491 ugkey.numeric &= !key->numeric;
2492 ugkey.general_numeric &= !key->general_numeric;
2493 ugkey.human_numeric &= !key->human_numeric;
2494 ugkey.random &= !key->random;
2495 ugkey.version &= !key->version;
2496 ugkey.reverse &= !key->reverse;
2499 /* Warn about ignored global options flagged above.
2500 This clears all flags if UGKEY is the only one in the list. */
2501 if (!default_key_compare (&ugkey)
2502 || (ugkey.reverse && (stable || unique) && keylist))
2504 bool ugkey_reverse = ugkey.reverse;
2505 if (!(stable || unique))
2506 ugkey.reverse = false;
2507 /* The following is too big, but guaranteed to be "big enough". */
2508 char opts[sizeof short_options];
2509 key_to_opts (&ugkey, opts);
2510 error (0, 0,
2511 ngettext ("option '-%s' is ignored",
2512 "options '-%s' are ignored",
2513 select_plural (strlen (opts))), opts);
2514 ugkey.reverse = ugkey_reverse;
2516 if (ugkey.reverse && !(stable || unique) && keylist)
2517 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2520 /* Compare two lines A and B trying every key in sequence until there
2521 are no more keys or a difference is found. */
2523 static int
2524 keycompare (struct line const *a, struct line const *b)
2526 struct keyfield *key = keylist;
2528 /* For the first iteration only, the key positions have been
2529 precomputed for us. */
2530 char *texta = a->keybeg;
2531 char *textb = b->keybeg;
2532 char *lima = a->keylim;
2533 char *limb = b->keylim;
2535 int diff;
2537 while (true)
2539 char const *translate = key->translate;
2540 bool const *ignore = key->ignore;
2542 /* Treat field ends before field starts as empty fields. */
2543 lima = MAX (texta, lima);
2544 limb = MAX (textb, limb);
2546 /* Find the lengths. */
2547 size_t lena = lima - texta;
2548 size_t lenb = limb - textb;
2550 if (hard_LC_COLLATE || key_numeric (key)
2551 || key->month || key->random || key->version)
2553 char *ta;
2554 char *tb;
2555 size_t tlena;
2556 size_t tlenb;
2558 char enda IF_LINT (= 0);
2559 char endb IF_LINT (= 0);
2560 void *allocated IF_LINT (= NULL);
2561 char stackbuf[4000];
2563 if (ignore || translate)
2565 /* Compute with copies of the keys, which are the result of
2566 translating or ignoring characters, and which need their
2567 own storage. */
2569 size_t i;
2571 /* Allocate space for copies. */
2572 size_t size = lena + 1 + lenb + 1;
2573 if (size <= sizeof stackbuf)
2574 ta = stackbuf, allocated = NULL;
2575 else
2576 ta = allocated = xmalloc (size);
2577 tb = ta + lena + 1;
2579 /* Put into each copy a version of the key in which the
2580 requested characters are ignored or translated. */
2581 for (tlena = i = 0; i < lena; i++)
2582 if (! (ignore && ignore[to_uchar (texta[i])]))
2583 ta[tlena++] = (translate
2584 ? translate[to_uchar (texta[i])]
2585 : texta[i]);
2586 ta[tlena] = '\0';
2588 for (tlenb = i = 0; i < lenb; i++)
2589 if (! (ignore && ignore[to_uchar (textb[i])]))
2590 tb[tlenb++] = (translate
2591 ? translate[to_uchar (textb[i])]
2592 : textb[i]);
2593 tb[tlenb] = '\0';
2595 else
2597 /* Use the keys in-place, temporarily null-terminated. */
2598 ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
2599 tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
2602 if (key->numeric)
2603 diff = numcompare (ta, tb);
2604 else if (key->general_numeric)
2605 diff = general_numcompare (ta, tb);
2606 else if (key->human_numeric)
2607 diff = human_numcompare (ta, tb);
2608 else if (key->month)
2609 diff = getmonth (ta, NULL) - getmonth (tb, NULL);
2610 else if (key->random)
2611 diff = compare_random (ta, tlena, tb, tlenb);
2612 else if (key->version)
2613 diff = filevercmp (ta, tb);
2614 else
2616 /* Locale-dependent string sorting. This is slower than
2617 C-locale sorting, which is implemented below. */
2618 if (tlena == 0)
2619 diff = - NONZERO (tlenb);
2620 else if (tlenb == 0)
2621 diff = 1;
2622 else
2623 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2626 if (ignore || translate)
2627 free (allocated);
2628 else
2630 ta[tlena] = enda;
2631 tb[tlenb] = endb;
2634 else if (ignore)
2636 #define CMP_WITH_IGNORE(A, B) \
2637 do \
2639 while (true) \
2641 while (texta < lima && ignore[to_uchar (*texta)]) \
2642 ++texta; \
2643 while (textb < limb && ignore[to_uchar (*textb)]) \
2644 ++textb; \
2645 if (! (texta < lima && textb < limb)) \
2646 break; \
2647 diff = to_uchar (A) - to_uchar (B); \
2648 if (diff) \
2649 goto not_equal; \
2650 ++texta; \
2651 ++textb; \
2654 diff = (texta < lima) - (textb < limb); \
2656 while (0)
2658 if (translate)
2659 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2660 translate[to_uchar (*textb)]);
2661 else
2662 CMP_WITH_IGNORE (*texta, *textb);
2664 else if (lena == 0)
2665 diff = - NONZERO (lenb);
2666 else if (lenb == 0)
2667 goto greater;
2668 else
2670 if (translate)
2672 while (texta < lima && textb < limb)
2674 diff = (to_uchar (translate[to_uchar (*texta++)])
2675 - to_uchar (translate[to_uchar (*textb++)]));
2676 if (diff)
2677 goto not_equal;
2680 else
2682 diff = memcmp (texta, textb, MIN (lena, lenb));
2683 if (diff)
2684 goto not_equal;
2686 diff = lena < lenb ? -1 : lena != lenb;
2689 if (diff)
2690 goto not_equal;
2692 key = key->next;
2693 if (! key)
2694 break;
2696 /* Find the beginning and limit of the next field. */
2697 if (key->eword != SIZE_MAX)
2698 lima = limfield (a, key), limb = limfield (b, key);
2699 else
2700 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2702 if (key->sword != SIZE_MAX)
2703 texta = begfield (a, key), textb = begfield (b, key);
2704 else
2706 texta = a->text, textb = b->text;
2707 if (key->skipsblanks)
2709 while (texta < lima && blanks[to_uchar (*texta)])
2710 ++texta;
2711 while (textb < limb && blanks[to_uchar (*textb)])
2712 ++textb;
2717 return 0;
2719 greater:
2720 diff = 1;
2721 not_equal:
2722 return key->reverse ? -diff : diff;
2725 /* Compare two lines A and B, returning negative, zero, or positive
2726 depending on whether A compares less than, equal to, or greater than B. */
2728 static int
2729 compare (struct line const *a, struct line const *b)
2731 int diff;
2732 size_t alen, blen;
2734 /* First try to compare on the specified keys (if any).
2735 The only two cases with no key at all are unadorned sort,
2736 and unadorned sort -r. */
2737 if (keylist)
2739 diff = keycompare (a, b);
2740 if (diff || unique || stable)
2741 return diff;
2744 /* If the keys all compare equal (or no keys were specified)
2745 fall through to the default comparison. */
2746 alen = a->length - 1, blen = b->length - 1;
2748 if (alen == 0)
2749 diff = - NONZERO (blen);
2750 else if (blen == 0)
2751 diff = 1;
2752 else if (hard_LC_COLLATE)
2754 /* xmemcoll0 is a performance enhancement as
2755 it will not unconditionally write '\0' after the
2756 passed in buffers, which was seen to give around
2757 a 3% increase in performance for short lines. */
2758 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2760 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2761 diff = alen < blen ? -1 : alen != blen;
2763 return reverse ? -diff : diff;
2766 /* Write LINE to output stream FP; the output file's name is
2767 OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
2768 otherwise. If debugging is enabled and FP is standard output,
2769 append some debugging information. */
2771 static void
2772 write_line (struct line const *line, FILE *fp, char const *output_file)
2774 char *buf = line->text;
2775 size_t n_bytes = line->length;
2776 char *ebuf = buf + n_bytes;
2778 if (!output_file && debug)
2780 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2781 char const *c = buf;
2783 while (c < ebuf)
2785 char wc = *c++;
2786 if (wc == '\t')
2787 wc = '>';
2788 else if (c == ebuf)
2789 wc = '\n';
2790 if (fputc (wc, fp) == EOF)
2791 sort_die (_("write failed"), output_file);
2794 debug_line (line);
2796 else
2798 ebuf[-1] = eolchar;
2799 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2800 sort_die (_("write failed"), output_file);
2801 ebuf[-1] = '\0';
2805 /* Check that the lines read from FILE_NAME come in order. Return
2806 true if they are in order. If CHECKONLY == 'c', also print a
2807 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2808 they are not in order. */
2810 static bool
2811 check (char const *file_name, char checkonly)
2813 FILE *fp = xfopen (file_name, "r");
2814 struct buffer buf; /* Input buffer. */
2815 struct line temp; /* Copy of previous line. */
2816 size_t alloc = 0;
2817 uintmax_t line_number = 0;
2818 struct keyfield const *key = keylist;
2819 bool nonunique = ! unique;
2820 bool ordered = true;
2822 initbuf (&buf, sizeof (struct line),
2823 MAX (merge_buffer_size, sort_size));
2824 temp.text = NULL;
2826 while (fillbuf (&buf, fp, file_name))
2828 struct line const *line = buffer_linelim (&buf);
2829 struct line const *linebase = line - buf.nlines;
2831 /* Make sure the line saved from the old buffer contents is
2832 less than or equal to the first line of the new buffer. */
2833 if (alloc && nonunique <= compare (&temp, line - 1))
2835 found_disorder:
2837 if (checkonly == 'c')
2839 struct line const *disorder_line = line - 1;
2840 uintmax_t disorder_line_number =
2841 buffer_linelim (&buf) - disorder_line + line_number;
2842 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2843 fprintf (stderr, _("%s: %s:%s: disorder: "),
2844 program_name, file_name,
2845 umaxtostr (disorder_line_number, hr_buf));
2846 write_line (disorder_line, stderr, _("standard error"));
2849 ordered = false;
2850 break;
2854 /* Compare each line in the buffer with its successor. */
2855 while (linebase < --line)
2856 if (nonunique <= compare (line, line - 1))
2857 goto found_disorder;
2859 line_number += buf.nlines;
2861 /* Save the last line of the buffer. */
2862 if (alloc < line->length)
2866 alloc *= 2;
2867 if (! alloc)
2869 alloc = line->length;
2870 break;
2873 while (alloc < line->length);
2875 free (temp.text);
2876 temp.text = xmalloc (alloc);
2878 memcpy (temp.text, line->text, line->length);
2879 temp.length = line->length;
2880 if (key)
2882 temp.keybeg = temp.text + (line->keybeg - line->text);
2883 temp.keylim = temp.text + (line->keylim - line->text);
2887 xfclose (fp, file_name);
2888 free (buf.buf);
2889 free (temp.text);
2890 return ordered;
2893 /* Open FILES (there are NFILES of them) and store the resulting array
2894 of stream pointers into (*PFPS). Allocate the array. Return the
2895 number of successfully opened files, setting errno if this value is
2896 less than NFILES. */
2898 static size_t
2899 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2901 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2902 int i;
2904 /* Open as many input files as we can. */
2905 for (i = 0; i < nfiles; i++)
2907 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
2908 ? open_temp (files[i].temp)
2909 : stream_open (files[i].name, "r"));
2910 if (!fps[i])
2911 break;
2914 return i;
2917 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2918 files (all of which are at the start of the FILES array), and
2919 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2920 FPS is the vector of open stream corresponding to the files.
2921 Close input and output streams before returning.
2922 OUTPUT_FILE gives the name of the output file. If it is NULL,
2923 the output file is standard output. */
2925 static void
2926 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2927 FILE *ofp, char const *output_file, FILE **fps)
2929 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2930 /* Input buffers for each file. */
2931 struct line saved; /* Saved line storage for unique check. */
2932 struct line const *savedline = NULL;
2933 /* &saved if there is a saved line. */
2934 size_t savealloc = 0; /* Size allocated for the saved line. */
2935 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2936 /* Current line in each line table. */
2937 struct line const **base = xnmalloc (nfiles, sizeof *base);
2938 /* Base of each line table. */
2939 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2940 /* Table representing a permutation of fps,
2941 such that cur[ord[0]] is the smallest line
2942 and will be next output. */
2943 size_t i;
2944 size_t j;
2945 size_t t;
2946 struct keyfield const *key = keylist;
2947 saved.text = NULL;
2949 /* Read initial lines from each input file. */
2950 for (i = 0; i < nfiles; )
2952 initbuf (&buffer[i], sizeof (struct line),
2953 MAX (merge_buffer_size, sort_size / nfiles));
2954 if (fillbuf (&buffer[i], fps[i], files[i].name))
2956 struct line const *linelim = buffer_linelim (&buffer[i]);
2957 cur[i] = linelim - 1;
2958 base[i] = linelim - buffer[i].nlines;
2959 i++;
2961 else
2963 /* fps[i] is empty; eliminate it from future consideration. */
2964 xfclose (fps[i], files[i].name);
2965 if (i < ntemps)
2967 ntemps--;
2968 zaptemp (files[i].name);
2970 free (buffer[i].buf);
2971 --nfiles;
2972 for (j = i; j < nfiles; ++j)
2974 files[j] = files[j + 1];
2975 fps[j] = fps[j + 1];
2980 /* Set up the ord table according to comparisons among input lines.
2981 Since this only reorders two items if one is strictly greater than
2982 the other, it is stable. */
2983 for (i = 0; i < nfiles; ++i)
2984 ord[i] = i;
2985 for (i = 1; i < nfiles; ++i)
2986 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2987 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2989 /* Repeatedly output the smallest line until no input remains. */
2990 while (nfiles)
2992 struct line const *smallest = cur[ord[0]];
2994 /* If uniquified output is turned on, output only the first of
2995 an identical series of lines. */
2996 if (unique)
2998 if (savedline && compare (savedline, smallest))
3000 savedline = NULL;
3001 write_line (&saved, ofp, output_file);
3003 if (!savedline)
3005 savedline = &saved;
3006 if (savealloc < smallest->length)
3009 if (! savealloc)
3011 savealloc = smallest->length;
3012 break;
3014 while ((savealloc *= 2) < smallest->length);
3016 free (saved.text);
3017 saved.text = xmalloc (savealloc);
3019 saved.length = smallest->length;
3020 memcpy (saved.text, smallest->text, saved.length);
3021 if (key)
3023 saved.keybeg =
3024 saved.text + (smallest->keybeg - smallest->text);
3025 saved.keylim =
3026 saved.text + (smallest->keylim - smallest->text);
3030 else
3031 write_line (smallest, ofp, output_file);
3033 /* Check if we need to read more lines into core. */
3034 if (base[ord[0]] < smallest)
3035 cur[ord[0]] = smallest - 1;
3036 else
3038 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
3040 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
3041 cur[ord[0]] = linelim - 1;
3042 base[ord[0]] = linelim - buffer[ord[0]].nlines;
3044 else
3046 /* We reached EOF on fps[ord[0]]. */
3047 for (i = 1; i < nfiles; ++i)
3048 if (ord[i] > ord[0])
3049 --ord[i];
3050 --nfiles;
3051 xfclose (fps[ord[0]], files[ord[0]].name);
3052 if (ord[0] < ntemps)
3054 ntemps--;
3055 zaptemp (files[ord[0]].name);
3057 free (buffer[ord[0]].buf);
3058 for (i = ord[0]; i < nfiles; ++i)
3060 fps[i] = fps[i + 1];
3061 files[i] = files[i + 1];
3062 buffer[i] = buffer[i + 1];
3063 cur[i] = cur[i + 1];
3064 base[i] = base[i + 1];
3066 for (i = 0; i < nfiles; ++i)
3067 ord[i] = ord[i + 1];
3068 continue;
3072 /* The new line just read in may be larger than other lines
3073 already in main memory; push it back in the queue until we
3074 encounter a line larger than it. Optimize for the common
3075 case where the new line is smallest. */
3077 size_t lo = 1;
3078 size_t hi = nfiles;
3079 size_t probe = lo;
3080 size_t ord0 = ord[0];
3081 size_t count_of_smaller_lines;
3083 while (lo < hi)
3085 int cmp = compare (cur[ord0], cur[ord[probe]]);
3086 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3087 hi = probe;
3088 else
3089 lo = probe + 1;
3090 probe = (lo + hi) / 2;
3093 count_of_smaller_lines = lo - 1;
3094 for (j = 0; j < count_of_smaller_lines; j++)
3095 ord[j] = ord[j + 1];
3096 ord[count_of_smaller_lines] = ord0;
3100 if (unique && savedline)
3102 write_line (&saved, ofp, output_file);
3103 free (saved.text);
3106 xfclose (ofp, output_file);
3107 free (fps);
3108 free (buffer);
3109 free (ord);
3110 free (base);
3111 free (cur);
3114 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3115 files (all of which are at the start of the FILES array), and
3116 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3117 Close input and output files before returning.
3118 OUTPUT_FILE gives the name of the output file.
3120 Return the number of files successfully merged. This number can be
3121 less than NFILES if we ran low on file descriptors, but in this
3122 case it is never less than 2. */
3124 static size_t
3125 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3126 FILE *ofp, char const *output_file)
3128 FILE **fps;
3129 size_t nopened = open_input_files (files, nfiles, &fps);
3130 if (nopened < nfiles && nopened < 2)
3131 sort_die (_("open failed"), files[nopened].name);
3132 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3133 return nopened;
3136 /* Merge into T (of size NLINES) the two sorted arrays of lines
3137 LO (with NLINES / 2 members), and
3138 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3139 T and LO point just past their respective arrays, and the arrays
3140 are in reverse order. NLINES must be at least 2. */
3142 static void
3143 mergelines (struct line *restrict t, size_t nlines,
3144 struct line const *restrict lo)
3146 size_t nlo = nlines / 2;
3147 size_t nhi = nlines - nlo;
3148 struct line *hi = t - nlo;
3150 while (true)
3151 if (compare (lo - 1, hi - 1) <= 0)
3153 *--t = *--lo;
3154 if (! --nlo)
3156 /* HI must equal T now, and there is no need to copy from
3157 HI to T. */
3158 return;
3161 else
3163 *--t = *--hi;
3164 if (! --nhi)
3167 *--t = *--lo;
3168 while (--nlo);
3170 return;
3175 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3176 Do this all within one thread. NLINES must be at least 2.
3177 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3178 Otherwise the sort is in-place and TEMP is half-sized.
3179 The input and output arrays are in reverse order, and LINES and
3180 TEMP point just past the end of their respective arrays.
3182 Use a recursive divide-and-conquer algorithm, in the style
3183 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3184 the optimization suggested by exercise 5.2.4-10; this requires room
3185 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3186 writes that this memory optimization was originally published by
3187 D. A. Bell, Comp J. 1 (1958), 75. */
3189 static void
3190 sequential_sort (struct line *restrict lines, size_t nlines,
3191 struct line *restrict temp, bool to_temp)
3193 if (nlines == 2)
3195 /* Declare 'swap' as int, not bool, to work around a bug
3196 <https://lists.gnu.org/r/bug-coreutils/2005-10/msg00086.html>
3197 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3198 int swap = (0 < compare (&lines[-1], &lines[-2]));
3199 if (to_temp)
3201 temp[-1] = lines[-1 - swap];
3202 temp[-2] = lines[-2 + swap];
3204 else if (swap)
3206 temp[-1] = lines[-1];
3207 lines[-1] = lines[-2];
3208 lines[-2] = temp[-1];
3211 else
3213 size_t nlo = nlines / 2;
3214 size_t nhi = nlines - nlo;
3215 struct line *lo = lines;
3216 struct line *hi = lines - nlo;
3218 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3219 if (1 < nlo)
3220 sequential_sort (lo, nlo, temp, !to_temp);
3221 else if (!to_temp)
3222 temp[-1] = lo[-1];
3224 struct line *dest;
3225 struct line const *sorted_lo;
3226 if (to_temp)
3228 dest = temp;
3229 sorted_lo = lines;
3231 else
3233 dest = lines;
3234 sorted_lo = temp;
3236 mergelines (dest, nlines, sorted_lo);
3240 static struct merge_node *init_node (struct merge_node *restrict,
3241 struct merge_node *restrict,
3242 struct line *, size_t, size_t, bool);
3245 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3246 lines, with destination DEST. */
3247 static struct merge_node *
3248 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3250 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3252 struct merge_node *root = merge_tree;
3253 root->lo = root->hi = root->end_lo = root->end_hi = NULL;
3254 root->dest = NULL;
3255 root->nlo = root->nhi = nlines;
3256 root->parent = NULL;
3257 root->level = MERGE_END;
3258 root->queued = false;
3259 pthread_mutex_init (&root->lock, NULL);
3261 init_node (root, root + 1, dest, nthreads, nlines, false);
3262 return merge_tree;
3265 /* Destroy the merge tree. */
3266 static void
3267 merge_tree_destroy (size_t nthreads, struct merge_node *merge_tree)
3269 size_t n_nodes = nthreads * 2;
3270 struct merge_node *node = merge_tree;
3272 while (n_nodes--)
3274 pthread_mutex_destroy (&node->lock);
3275 node++;
3278 free (merge_tree);
3281 /* Initialize a merge tree node and its descendants. The node's
3282 parent is PARENT. The node and its descendants are taken from the
3283 array of nodes NODE_POOL. Their destination starts at DEST; they
3284 will consume NTHREADS threads. The total number of sort lines is
3285 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3286 its parent. */
3288 static struct merge_node *
3289 init_node (struct merge_node *restrict parent,
3290 struct merge_node *restrict node_pool,
3291 struct line *dest, size_t nthreads,
3292 size_t total_lines, bool is_lo_child)
3294 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3295 size_t nlo = nlines / 2;
3296 size_t nhi = nlines - nlo;
3297 struct line *lo = dest - total_lines;
3298 struct line *hi = lo - nlo;
3299 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3301 struct merge_node *node = node_pool++;
3302 node->lo = node->end_lo = lo;
3303 node->hi = node->end_hi = hi;
3304 node->dest = parent_end;
3305 node->nlo = nlo;
3306 node->nhi = nhi;
3307 node->parent = parent;
3308 node->level = parent->level + 1;
3309 node->queued = false;
3310 pthread_mutex_init (&node->lock, NULL);
3312 if (nthreads > 1)
3314 size_t lo_threads = nthreads / 2;
3315 size_t hi_threads = nthreads - lo_threads;
3316 node->lo_child = node_pool;
3317 node_pool = init_node (node, node_pool, lo, lo_threads,
3318 total_lines, true);
3319 node->hi_child = node_pool;
3320 node_pool = init_node (node, node_pool, hi, hi_threads,
3321 total_lines, false);
3323 else
3325 node->lo_child = NULL;
3326 node->hi_child = NULL;
3328 return node_pool;
3332 /* Compare two merge nodes A and B for priority. */
3334 static int
3335 compare_nodes (void const *a, void const *b)
3337 struct merge_node const *nodea = a;
3338 struct merge_node const *nodeb = b;
3339 if (nodea->level == nodeb->level)
3340 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3341 return nodea->level < nodeb->level;
3344 /* Lock a merge tree NODE. */
3346 static inline void
3347 lock_node (struct merge_node *node)
3349 pthread_mutex_lock (&node->lock);
3352 /* Unlock a merge tree NODE. */
3354 static inline void
3355 unlock_node (struct merge_node *node)
3357 pthread_mutex_unlock (&node->lock);
3360 /* Destroy merge QUEUE. */
3362 static void
3363 queue_destroy (struct merge_node_queue *queue)
3365 heap_free (queue->priority_queue);
3366 pthread_cond_destroy (&queue->cond);
3367 pthread_mutex_destroy (&queue->mutex);
3370 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3371 NTHREADS threads. */
3373 static void
3374 queue_init (struct merge_node_queue *queue, size_t nthreads)
3376 /* Though it's highly unlikely all nodes are in the heap at the same
3377 time, the heap should accommodate all of them. Counting a NULL
3378 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3379 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3380 pthread_mutex_init (&queue->mutex, NULL);
3381 pthread_cond_init (&queue->cond, NULL);
3384 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3385 does not need to lock NODE. */
3387 static void
3388 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3390 pthread_mutex_lock (&queue->mutex);
3391 heap_insert (queue->priority_queue, node);
3392 node->queued = true;
3393 pthread_cond_signal (&queue->cond);
3394 pthread_mutex_unlock (&queue->mutex);
3397 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3399 static struct merge_node *
3400 queue_pop (struct merge_node_queue *queue)
3402 struct merge_node *node;
3403 pthread_mutex_lock (&queue->mutex);
3404 while (! (node = heap_remove_top (queue->priority_queue)))
3405 pthread_cond_wait (&queue->cond, &queue->mutex);
3406 pthread_mutex_unlock (&queue->mutex);
3407 lock_node (node);
3408 node->queued = false;
3409 return node;
3412 /* Output LINE to TFP, unless -u is specified and the line compares
3413 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3414 is null if TFP is standard output.
3416 This function does not save the line for comparison later, so it is
3417 appropriate only for internal sort. */
3419 static void
3420 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3422 if (unique)
3424 if (saved_line.text && ! compare (line, &saved_line))
3425 return;
3426 saved_line = *line;
3429 write_line (line, tfp, temp_output);
3432 /* Merge the lines currently available to a NODE in the binary
3433 merge tree. Merge a number of lines appropriate for this merge
3434 level, assuming TOTAL_LINES is the total number of lines.
3436 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3437 the name of TFP, or is null if TFP is standard output. */
3439 static void
3440 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3441 FILE *tfp, char const *temp_output)
3443 struct line *lo_orig = node->lo;
3444 struct line *hi_orig = node->hi;
3445 size_t to_merge = MAX_MERGE (total_lines, node->level);
3446 size_t merged_lo;
3447 size_t merged_hi;
3449 if (node->level > MERGE_ROOT)
3451 /* Merge to destination buffer. */
3452 struct line *dest = *node->dest;
3453 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3454 if (compare (node->lo - 1, node->hi - 1) <= 0)
3455 *--dest = *--node->lo;
3456 else
3457 *--dest = *--node->hi;
3459 merged_lo = lo_orig - node->lo;
3460 merged_hi = hi_orig - node->hi;
3462 if (node->nhi == merged_hi)
3463 while (node->lo != node->end_lo && to_merge--)
3464 *--dest = *--node->lo;
3465 else if (node->nlo == merged_lo)
3466 while (node->hi != node->end_hi && to_merge--)
3467 *--dest = *--node->hi;
3468 *node->dest = dest;
3470 else
3472 /* Merge directly to output. */
3473 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3475 if (compare (node->lo - 1, node->hi - 1) <= 0)
3476 write_unique (--node->lo, tfp, temp_output);
3477 else
3478 write_unique (--node->hi, tfp, temp_output);
3481 merged_lo = lo_orig - node->lo;
3482 merged_hi = hi_orig - node->hi;
3484 if (node->nhi == merged_hi)
3486 while (node->lo != node->end_lo && to_merge--)
3487 write_unique (--node->lo, tfp, temp_output);
3489 else if (node->nlo == merged_lo)
3491 while (node->hi != node->end_hi && to_merge--)
3492 write_unique (--node->hi, tfp, temp_output);
3496 /* Update NODE. */
3497 merged_lo = lo_orig - node->lo;
3498 merged_hi = hi_orig - node->hi;
3499 node->nlo -= merged_lo;
3500 node->nhi -= merged_hi;
3503 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3504 NODE's children has available lines and the other either has
3505 available lines or has exhausted its lines. */
3507 static void
3508 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3510 if (! node->queued)
3512 bool lo_avail = (node->lo - node->end_lo) != 0;
3513 bool hi_avail = (node->hi - node->end_hi) != 0;
3514 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3515 queue_insert (queue, node);
3519 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3521 static void
3522 queue_check_insert_parent (struct merge_node_queue *queue,
3523 struct merge_node *node)
3525 if (node->level > MERGE_ROOT)
3527 lock_node (node->parent);
3528 queue_check_insert (queue, node->parent);
3529 unlock_node (node->parent);
3531 else if (node->nlo + node->nhi == 0)
3533 /* If the MERGE_ROOT NODE has finished merging, insert the
3534 MERGE_END node. */
3535 queue_insert (queue, node->parent);
3539 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3540 some of those lines, until the MERGE_END node is popped.
3541 TOTAL_LINES is the total number of lines. If merging at the top
3542 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3543 null if TFP is standard output. */
3545 static void
3546 merge_loop (struct merge_node_queue *queue,
3547 size_t total_lines, FILE *tfp, char const *temp_output)
3549 while (1)
3551 struct merge_node *node = queue_pop (queue);
3553 if (node->level == MERGE_END)
3555 unlock_node (node);
3556 /* Reinsert so other threads can pop it. */
3557 queue_insert (queue, node);
3558 break;
3560 mergelines_node (node, total_lines, tfp, temp_output);
3561 queue_check_insert (queue, node);
3562 queue_check_insert_parent (queue, node);
3564 unlock_node (node);
3569 static void sortlines (struct line *restrict, size_t, size_t,
3570 struct merge_node *, struct merge_node_queue *,
3571 FILE *, char const *);
3573 /* Thread arguments for sortlines_thread. */
3575 struct thread_args
3577 /* Source, i.e., the array of lines to sort. This points just past
3578 the end of the array. */
3579 struct line *lines;
3581 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3582 size_t nthreads;
3584 /* Number of lines in LINES and DEST. */
3585 size_t const total_lines;
3587 /* Merge node. Lines from this node and this node's sibling will merged
3588 to this node's parent. */
3589 struct merge_node *const node;
3591 /* The priority queue controlling available work for the entire
3592 internal sort. */
3593 struct merge_node_queue *const queue;
3595 /* If at the top level, the file to output to, and the file's name.
3596 If the file is standard output, the file's name is null. */
3597 FILE *tfp;
3598 char const *output_temp;
3601 /* Like sortlines, except with a signature acceptable to pthread_create. */
3603 static void *
3604 sortlines_thread (void *data)
3606 struct thread_args const *args = data;
3607 sortlines (args->lines, args->nthreads, args->total_lines,
3608 args->node, args->queue, args->tfp,
3609 args->output_temp);
3610 return NULL;
3613 /* Sort lines, possibly in parallel. The arguments are as in struct
3614 thread_args above.
3616 The algorithm has three phases: node creation, sequential sort,
3617 and binary merge.
3619 During node creation, sortlines recursively visits each node in the
3620 binary merge tree and creates a NODE structure corresponding to all the
3621 future line merging NODE is responsible for. For each call to
3622 sortlines, half the available threads are assigned to each recursive
3623 call, until a leaf node having only 1 available thread is reached.
3625 Each leaf node then performs two sequential sorts, one on each half of
3626 the lines it is responsible for. It records in its NODE structure that
3627 there are two sorted sublists available to merge from, and inserts its
3628 NODE into the priority queue.
3630 The binary merge phase then begins. Each thread drops into a loop
3631 where the thread retrieves a NODE from the priority queue, merges lines
3632 available to that NODE, and potentially insert NODE or its parent back
3633 into the queue if there are sufficient available lines for them to
3634 merge. This continues until all lines at all nodes of the merge tree
3635 have been merged. */
3637 static void
3638 sortlines (struct line *restrict lines, size_t nthreads,
3639 size_t total_lines, struct merge_node *node,
3640 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3642 size_t nlines = node->nlo + node->nhi;
3644 /* Calculate thread arguments. */
3645 size_t lo_threads = nthreads / 2;
3646 size_t hi_threads = nthreads - lo_threads;
3647 pthread_t thread;
3648 struct thread_args args = {lines, lo_threads, total_lines,
3649 node->lo_child, queue, tfp, temp_output};
3651 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3652 && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3654 sortlines (lines - node->nlo, hi_threads, total_lines,
3655 node->hi_child, queue, tfp, temp_output);
3656 pthread_join (thread, NULL);
3658 else
3660 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3661 Sort with 1 thread. */
3662 size_t nlo = node->nlo;
3663 size_t nhi = node->nhi;
3664 struct line *temp = lines - total_lines;
3665 if (1 < nhi)
3666 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3667 if (1 < nlo)
3668 sequential_sort (lines, nlo, temp, false);
3670 /* Update merge NODE. No need to lock yet. */
3671 node->lo = lines;
3672 node->hi = lines - nlo;
3673 node->end_lo = lines - nlo;
3674 node->end_hi = lines - nlo - nhi;
3676 queue_insert (queue, node);
3677 merge_loop (queue, total_lines, tfp, temp_output);
3681 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3682 the same as OUTFILE. If found, replace each with the same
3683 temporary copy that can be merged into OUTFILE without destroying
3684 OUTFILE before it is completely read. This temporary copy does not
3685 count as a merge temp, so don't worry about incrementing NTEMPS in
3686 the caller; final cleanup will remove it, not zaptemp.
3688 This test ensures that an otherwise-erroneous use like
3689 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3690 It's not clear that POSIX requires this nicety.
3691 Detect common error cases, but don't try to catch obscure cases like
3692 "cat ... FILE ... | sort -m -o FILE"
3693 where traditional "sort" doesn't copy the input and where
3694 people should know that they're getting into trouble anyway.
3695 Catching these obscure cases would slow down performance in
3696 common cases. */
3698 static void
3699 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3700 size_t nfiles, char const *outfile)
3702 bool got_outstat = false;
3703 struct stat outstat;
3704 struct tempnode *tempcopy = NULL;
3706 for (size_t i = ntemps; i < nfiles; i++)
3708 bool is_stdin = STREQ (files[i].name, "-");
3709 bool same;
3710 struct stat instat;
3712 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3713 same = true;
3714 else
3716 if (! got_outstat)
3718 if (fstat (STDOUT_FILENO, &outstat) != 0)
3719 break;
3720 got_outstat = true;
3723 same = (((is_stdin
3724 ? fstat (STDIN_FILENO, &instat)
3725 : stat (files[i].name, &instat))
3726 == 0)
3727 && SAME_INODE (instat, outstat));
3730 if (same)
3732 if (! tempcopy)
3734 FILE *tftp;
3735 tempcopy = create_temp (&tftp);
3736 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3739 files[i].name = tempcopy->name;
3740 files[i].temp = tempcopy;
3745 /* Scan the input files to ensure all are accessible.
3746 Otherwise exit with a diagnostic.
3748 This will catch common issues with permissions etc.
3749 but will fail to notice issues where you can open but not read,
3750 like when a directory is specified on some systems.
3751 Catching these obscure cases could slow down performance in
3752 common cases. */
3754 static void
3755 check_inputs (char *const *files, size_t nfiles)
3757 for (size_t i = 0; i < nfiles; i++)
3759 if (STREQ (files[i], "-"))
3760 continue;
3762 if (euidaccess (files[i], R_OK) != 0)
3763 sort_die (_("cannot read"), files[i]);
3767 /* Ensure a specified output file can be created or written to,
3768 and point stdout to it. Do not truncate the file.
3769 Exit with a diagnostic on failure. */
3771 static void
3772 check_output (char const *outfile)
3774 if (outfile)
3776 int oflags = O_WRONLY | O_BINARY | O_CLOEXEC | O_CREAT;
3777 int outfd = open (outfile, oflags, MODE_RW_UGO);
3778 if (outfd < 0)
3779 sort_die (_("open failed"), outfile);
3780 move_fd (outfd, STDOUT_FILENO);
3784 /* Merge the input FILES. NTEMPS is the number of files at the
3785 start of FILES that are temporary; it is zero at the top level.
3786 NFILES is the total number of files. Put the output in
3787 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3789 static void
3790 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3791 char const *output_file)
3793 while (nmerge < nfiles)
3795 /* Number of input files processed so far. */
3796 size_t in;
3798 /* Number of output files generated so far. */
3799 size_t out;
3801 /* nfiles % NMERGE; this counts input files that are left over
3802 after all full-sized merges have been done. */
3803 size_t remainder;
3805 /* Number of easily-available slots at the next loop iteration. */
3806 size_t cheap_slots;
3808 /* Do as many NMERGE-size merges as possible. In the case that
3809 nmerge is bogus, increment by the maximum number of file
3810 descriptors allowed. */
3811 for (out = in = 0; nmerge <= nfiles - in; out++)
3813 FILE *tfp;
3814 struct tempnode *temp = create_temp (&tfp);
3815 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3816 nmerge, tfp, temp->name);
3817 ntemps -= MIN (ntemps, num_merged);
3818 files[out].name = temp->name;
3819 files[out].temp = temp;
3820 in += num_merged;
3823 remainder = nfiles - in;
3824 cheap_slots = nmerge - out % nmerge;
3826 if (cheap_slots < remainder)
3828 /* So many files remain that they can't all be put into the last
3829 NMERGE-sized output window. Do one more merge. Merge as few
3830 files as possible, to avoid needless I/O. */
3831 size_t nshortmerge = remainder - cheap_slots + 1;
3832 FILE *tfp;
3833 struct tempnode *temp = create_temp (&tfp);
3834 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3835 nshortmerge, tfp, temp->name);
3836 ntemps -= MIN (ntemps, num_merged);
3837 files[out].name = temp->name;
3838 files[out++].temp = temp;
3839 in += num_merged;
3842 /* Put the remaining input files into the last NMERGE-sized output
3843 window, so they will be merged in the next pass. */
3844 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3845 ntemps += out;
3846 nfiles -= in - out;
3849 avoid_trashing_input (files, ntemps, nfiles, output_file);
3851 /* We aren't guaranteed that this final mergefiles will work, therefore we
3852 try to merge into the output, and then merge as much as we can into a
3853 temp file if we can't. Repeat. */
3855 while (true)
3857 /* Merge directly into the output file if possible. */
3858 FILE **fps;
3859 size_t nopened = open_input_files (files, nfiles, &fps);
3861 if (nopened == nfiles)
3863 FILE *ofp = stream_open (output_file, "w");
3864 if (ofp)
3866 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3867 break;
3869 if (errno != EMFILE || nopened <= 2)
3870 sort_die (_("open failed"), output_file);
3872 else if (nopened <= 2)
3873 sort_die (_("open failed"), files[nopened].name);
3875 /* We ran out of file descriptors. Close one of the input
3876 files, to gain a file descriptor. Then create a temporary
3877 file with our spare file descriptor. Retry if that failed
3878 (e.g., some other process could open a file between the time
3879 we closed and tried to create). */
3880 FILE *tfp;
3881 struct tempnode *temp;
3884 nopened--;
3885 xfclose (fps[nopened], files[nopened].name);
3886 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
3888 while (!temp);
3890 /* Merge into the newly allocated temporary. */
3891 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
3892 fps);
3893 ntemps -= MIN (ntemps, nopened);
3894 files[0].name = temp->name;
3895 files[0].temp = temp;
3897 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3898 ntemps++;
3899 nfiles -= nopened - 1;
3903 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3905 static void
3906 sort (char *const *files, size_t nfiles, char const *output_file,
3907 size_t nthreads)
3909 struct buffer buf;
3910 IF_LINT (buf.buf = NULL);
3911 size_t ntemps = 0;
3912 bool output_file_created = false;
3914 buf.alloc = 0;
3916 while (nfiles)
3918 char const *temp_output;
3919 char const *file = *files;
3920 FILE *fp = xfopen (file, "r");
3921 FILE *tfp;
3923 size_t bytes_per_line;
3924 if (nthreads > 1)
3926 /* Get log P. */
3927 size_t tmp = 1;
3928 size_t mult = 1;
3929 while (tmp < nthreads)
3931 tmp *= 2;
3932 mult++;
3934 bytes_per_line = (mult * sizeof (struct line));
3936 else
3937 bytes_per_line = sizeof (struct line) * 3 / 2;
3939 if (! buf.alloc)
3940 initbuf (&buf, bytes_per_line,
3941 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3942 buf.eof = false;
3943 files++;
3944 nfiles--;
3946 while (fillbuf (&buf, fp, file))
3948 struct line *line;
3950 if (buf.eof && nfiles
3951 && (bytes_per_line + 1
3952 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3954 /* End of file, but there is more input and buffer room.
3955 Concatenate the next input file; this is faster in
3956 the usual case. */
3957 buf.left = buf.used;
3958 break;
3961 saved_line.text = NULL;
3962 line = buffer_linelim (&buf);
3963 if (buf.eof && !nfiles && !ntemps && !buf.left)
3965 xfclose (fp, file);
3966 tfp = xfopen (output_file, "w");
3967 temp_output = output_file;
3968 output_file_created = true;
3970 else
3972 ++ntemps;
3973 temp_output = create_temp (&tfp)->name;
3975 if (1 < buf.nlines)
3977 struct merge_node_queue queue;
3978 queue_init (&queue, nthreads);
3979 struct merge_node *merge_tree =
3980 merge_tree_init (nthreads, buf.nlines, line);
3982 sortlines (line, nthreads, buf.nlines, merge_tree + 1,
3983 &queue, tfp, temp_output);
3985 #ifdef lint
3986 merge_tree_destroy (nthreads, merge_tree);
3987 queue_destroy (&queue);
3988 #endif
3990 else
3991 write_unique (line - 1, tfp, temp_output);
3993 xfclose (tfp, temp_output);
3995 if (output_file_created)
3996 goto finish;
3998 xfclose (fp, file);
4001 finish:
4002 free (buf.buf);
4004 if (! output_file_created)
4006 struct tempnode *node = temphead;
4007 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
4008 for (size_t i = 0; node; i++)
4010 tempfiles[i].name = node->name;
4011 tempfiles[i].temp = node;
4012 node = node->next;
4014 merge (tempfiles, ntemps, ntemps, output_file);
4015 free (tempfiles);
4018 reap_all ();
4021 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
4023 static void
4024 insertkey (struct keyfield *key_arg)
4026 struct keyfield **p;
4027 struct keyfield *key = xmemdup (key_arg, sizeof *key);
4029 for (p = &keylist; *p; p = &(*p)->next)
4030 continue;
4031 *p = key;
4032 key->next = NULL;
4035 /* Report a bad field specification SPEC, with extra info MSGID. */
4037 static void badfieldspec (char const *, char const *)
4038 ATTRIBUTE_NORETURN;
4039 static void
4040 badfieldspec (char const *spec, char const *msgid)
4042 die (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
4043 _(msgid), quote (spec));
4046 /* Report incompatible options. */
4048 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
4049 static void
4050 incompatible_options (char const *opts)
4052 die (SORT_FAILURE, 0, _("options '-%s' are incompatible"), (opts));
4055 /* Check compatibility of ordering options. */
4057 static void
4058 check_ordering_compatibility (void)
4060 struct keyfield *key;
4062 for (key = keylist; key; key = key->next)
4063 if (1 < (key->numeric + key->general_numeric + key->human_numeric
4064 + key->month + (key->version | key->random | !!key->ignore)))
4066 /* The following is too big, but guaranteed to be "big enough". */
4067 char opts[sizeof short_options];
4068 /* Clear flags we're not interested in. */
4069 key->skipsblanks = key->skipeblanks = key->reverse = false;
4070 key_to_opts (key, opts);
4071 incompatible_options (opts);
4075 /* Parse the leading integer in STRING and store the resulting value
4076 (which must fit into size_t) into *VAL. Return the address of the
4077 suffix after the integer. If the value is too large, silently
4078 substitute SIZE_MAX. If MSGID is NULL, return NULL after
4079 failure; otherwise, report MSGID and exit on failure. */
4081 static char const *
4082 parse_field_count (char const *string, size_t *val, char const *msgid)
4084 char *suffix;
4085 uintmax_t n;
4087 switch (xstrtoumax (string, &suffix, 10, &n, ""))
4089 case LONGINT_OK:
4090 case LONGINT_INVALID_SUFFIX_CHAR:
4091 *val = n;
4092 if (*val == n)
4093 break;
4094 FALLTHROUGH;
4095 case LONGINT_OVERFLOW:
4096 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
4097 *val = SIZE_MAX;
4098 break;
4100 case LONGINT_INVALID:
4101 if (msgid)
4102 die (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
4103 _(msgid), quote (string));
4104 return NULL;
4107 return suffix;
4110 /* Handle interrupts and hangups. */
4112 static void
4113 sighandler (int sig)
4115 if (! SA_NOCLDSTOP)
4116 signal (sig, SIG_IGN);
4118 cleanup ();
4120 signal (sig, SIG_DFL);
4121 raise (sig);
4124 /* Set the ordering options for KEY specified in S.
4125 Return the address of the first character in S that
4126 is not a valid ordering option.
4127 BLANKTYPE is the kind of blanks that 'b' should skip. */
4129 static char *
4130 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4132 while (*s)
4134 switch (*s)
4136 case 'b':
4137 if (blanktype == bl_start || blanktype == bl_both)
4138 key->skipsblanks = true;
4139 if (blanktype == bl_end || blanktype == bl_both)
4140 key->skipeblanks = true;
4141 break;
4142 case 'd':
4143 key->ignore = nondictionary;
4144 break;
4145 case 'f':
4146 key->translate = fold_toupper;
4147 break;
4148 case 'g':
4149 key->general_numeric = true;
4150 break;
4151 case 'h':
4152 key->human_numeric = true;
4153 break;
4154 case 'i':
4155 /* Option order should not matter, so don't let -i override
4156 -d. -d implies -i, but -i does not imply -d. */
4157 if (! key->ignore)
4158 key->ignore = nonprinting;
4159 break;
4160 case 'M':
4161 key->month = true;
4162 break;
4163 case 'n':
4164 key->numeric = true;
4165 break;
4166 case 'R':
4167 key->random = true;
4168 break;
4169 case 'r':
4170 key->reverse = true;
4171 break;
4172 case 'V':
4173 key->version = true;
4174 break;
4175 default:
4176 return (char *) s;
4178 ++s;
4180 return (char *) s;
4183 /* Initialize KEY. */
4185 static struct keyfield *
4186 key_init (struct keyfield *key)
4188 memset (key, 0, sizeof *key);
4189 key->eword = SIZE_MAX;
4190 return key;
4194 main (int argc, char **argv)
4196 struct keyfield *key;
4197 struct keyfield key_buf;
4198 struct keyfield gkey;
4199 bool gkey_only = false;
4200 char const *s;
4201 int c = 0;
4202 char checkonly = 0;
4203 bool mergeonly = false;
4204 char *random_source = NULL;
4205 bool need_random = false;
4206 size_t nthreads = 0;
4207 size_t nfiles = 0;
4208 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
4209 int posix_ver = posix2_version ();
4210 bool traditional_usage = ! (200112 <= posix_ver && posix_ver < 200809);
4211 char **files;
4212 char *files_from = NULL;
4213 struct Tokens tok;
4214 char const *outfile = NULL;
4215 bool locale_ok;
4217 initialize_main (&argc, &argv);
4218 set_program_name (argv[0]);
4219 locale_ok = !! setlocale (LC_ALL, "");
4220 bindtextdomain (PACKAGE, LOCALEDIR);
4221 textdomain (PACKAGE);
4223 initialize_exit_failure (SORT_FAILURE);
4225 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4226 #if HAVE_NL_LANGINFO
4227 hard_LC_TIME = hard_locale (LC_TIME);
4228 #endif
4230 /* Get locale's representation of the decimal point. */
4232 struct lconv const *locale = localeconv ();
4234 /* If the locale doesn't define a decimal point, or if the decimal
4235 point is multibyte, use the C locale's decimal point. FIXME:
4236 add support for multibyte decimal points. */
4237 decimal_point = to_uchar (locale->decimal_point[0]);
4238 if (! decimal_point || locale->decimal_point[1])
4239 decimal_point = '.';
4241 /* FIXME: add support for multibyte thousands separators. */
4242 thousands_sep = to_uchar (*locale->thousands_sep);
4243 if (! thousands_sep || locale->thousands_sep[1])
4244 thousands_sep = -1;
4247 have_read_stdin = false;
4248 inittables ();
4251 size_t i;
4252 static int const sig[] =
4254 /* The usual suspects. */
4255 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4256 #ifdef SIGPOLL
4257 SIGPOLL,
4258 #endif
4259 #ifdef SIGPROF
4260 SIGPROF,
4261 #endif
4262 #ifdef SIGVTALRM
4263 SIGVTALRM,
4264 #endif
4265 #ifdef SIGXCPU
4266 SIGXCPU,
4267 #endif
4268 #ifdef SIGXFSZ
4269 SIGXFSZ,
4270 #endif
4272 enum { nsigs = ARRAY_CARDINALITY (sig) };
4274 #if SA_NOCLDSTOP
4275 struct sigaction act;
4277 sigemptyset (&caught_signals);
4278 for (i = 0; i < nsigs; i++)
4280 sigaction (sig[i], NULL, &act);
4281 if (act.sa_handler != SIG_IGN)
4282 sigaddset (&caught_signals, sig[i]);
4285 act.sa_handler = sighandler;
4286 act.sa_mask = caught_signals;
4287 act.sa_flags = 0;
4289 for (i = 0; i < nsigs; i++)
4290 if (sigismember (&caught_signals, sig[i]))
4291 sigaction (sig[i], &act, NULL);
4292 #else
4293 for (i = 0; i < nsigs; i++)
4294 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4296 signal (sig[i], sighandler);
4297 siginterrupt (sig[i], 1);
4299 #endif
4301 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4303 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4304 atexit (exit_cleanup);
4306 key_init (&gkey);
4307 gkey.sword = SIZE_MAX;
4309 files = xnmalloc (argc, sizeof *files);
4311 while (true)
4313 /* Parse an operand as a file after "--" was seen; or if
4314 pedantic and a file was seen, unless the POSIX version
4315 is not 1003.1-2001 and -c was not seen and the operand is
4316 "-o FILE" or "-oFILE". */
4317 int oi = -1;
4319 if (c == -1
4320 || (posixly_correct && nfiles != 0
4321 && ! (traditional_usage
4322 && ! checkonly
4323 && optind != argc
4324 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4325 && (argv[optind][2] || optind + 1 != argc)))
4326 || ((c = getopt_long (argc, argv, short_options,
4327 long_options, &oi))
4328 == -1))
4330 if (argc <= optind)
4331 break;
4332 files[nfiles++] = argv[optind++];
4334 else switch (c)
4336 case 1:
4337 key = NULL;
4338 if (optarg[0] == '+')
4340 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4341 && ISDIGIT (argv[optind][1]));
4342 traditional_usage |= minus_pos_usage && !posixly_correct;
4343 if (traditional_usage)
4345 /* Treat +POS1 [-POS2] as a key if possible; but silently
4346 treat an operand as a file if it is not a valid +POS1. */
4347 key = key_init (&key_buf);
4348 s = parse_field_count (optarg + 1, &key->sword, NULL);
4349 if (s && *s == '.')
4350 s = parse_field_count (s + 1, &key->schar, NULL);
4351 if (! (key->sword || key->schar))
4352 key->sword = SIZE_MAX;
4353 if (! s || *set_ordering (s, key, bl_start))
4354 key = NULL;
4355 else
4357 if (minus_pos_usage)
4359 char const *optarg1 = argv[optind++];
4360 s = parse_field_count (optarg1 + 1, &key->eword,
4361 N_("invalid number after '-'"));
4362 /* When called with a non-NULL message ID,
4363 parse_field_count cannot return NULL. Tell static
4364 analysis tools that dereferencing S is safe. */
4365 assert (s);
4366 if (*s == '.')
4367 s = parse_field_count (s + 1, &key->echar,
4368 N_("invalid number after '.'"));
4369 if (!key->echar && key->eword)
4371 /* obsolescent syntax +A.x -B.y is equivalent to:
4372 -k A+1.x+1,B.y (when y = 0)
4373 -k A+1.x+1,B+1.y (when y > 0)
4374 So eword is decremented as in the -k case
4375 only when the end field (B) is specified and
4376 echar (y) is 0. */
4377 key->eword--;
4379 if (*set_ordering (s, key, bl_end))
4380 badfieldspec (optarg1,
4381 N_("stray character in field spec"));
4383 key->traditional_used = true;
4384 insertkey (key);
4388 if (! key)
4389 files[nfiles++] = optarg;
4390 break;
4392 case SORT_OPTION:
4393 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4394 FALLTHROUGH;
4395 case 'b':
4396 case 'd':
4397 case 'f':
4398 case 'g':
4399 case 'h':
4400 case 'i':
4401 case 'M':
4402 case 'n':
4403 case 'r':
4404 case 'R':
4405 case 'V':
4407 char str[2];
4408 str[0] = c;
4409 str[1] = '\0';
4410 set_ordering (str, &gkey, bl_both);
4412 break;
4414 case CHECK_OPTION:
4415 c = (optarg
4416 ? XARGMATCH ("--check", optarg, check_args, check_types)
4417 : 'c');
4418 FALLTHROUGH;
4419 case 'c':
4420 case 'C':
4421 if (checkonly && checkonly != c)
4422 incompatible_options ("cC");
4423 checkonly = c;
4424 break;
4426 case COMPRESS_PROGRAM_OPTION:
4427 if (compress_program && !STREQ (compress_program, optarg))
4428 die (SORT_FAILURE, 0, _("multiple compress programs specified"));
4429 compress_program = optarg;
4430 break;
4432 case DEBUG_PROGRAM_OPTION:
4433 debug = true;
4434 break;
4436 case FILES0_FROM_OPTION:
4437 files_from = optarg;
4438 break;
4440 case 'k':
4441 key = key_init (&key_buf);
4443 /* Get POS1. */
4444 s = parse_field_count (optarg, &key->sword,
4445 N_("invalid number at field start"));
4446 if (! key->sword--)
4448 /* Provoke with 'sort -k0' */
4449 badfieldspec (optarg, N_("field number is zero"));
4451 if (*s == '.')
4453 s = parse_field_count (s + 1, &key->schar,
4454 N_("invalid number after '.'"));
4455 if (! key->schar--)
4457 /* Provoke with 'sort -k1.0' */
4458 badfieldspec (optarg, N_("character offset is zero"));
4461 if (! (key->sword || key->schar))
4462 key->sword = SIZE_MAX;
4463 s = set_ordering (s, key, bl_start);
4464 if (*s != ',')
4466 key->eword = SIZE_MAX;
4467 key->echar = 0;
4469 else
4471 /* Get POS2. */
4472 s = parse_field_count (s + 1, &key->eword,
4473 N_("invalid number after ','"));
4474 if (! key->eword--)
4476 /* Provoke with 'sort -k1,0' */
4477 badfieldspec (optarg, N_("field number is zero"));
4479 if (*s == '.')
4481 s = parse_field_count (s + 1, &key->echar,
4482 N_("invalid number after '.'"));
4484 s = set_ordering (s, key, bl_end);
4486 if (*s)
4487 badfieldspec (optarg, N_("stray character in field spec"));
4488 insertkey (key);
4489 break;
4491 case 'm':
4492 mergeonly = true;
4493 break;
4495 case NMERGE_OPTION:
4496 specify_nmerge (oi, c, optarg);
4497 break;
4499 case 'o':
4500 if (outfile && !STREQ (outfile, optarg))
4501 die (SORT_FAILURE, 0, _("multiple output files specified"));
4502 outfile = optarg;
4503 break;
4505 case RANDOM_SOURCE_OPTION:
4506 if (random_source && !STREQ (random_source, optarg))
4507 die (SORT_FAILURE, 0, _("multiple random sources specified"));
4508 random_source = optarg;
4509 break;
4511 case 's':
4512 stable = true;
4513 break;
4515 case 'S':
4516 specify_sort_size (oi, c, optarg);
4517 break;
4519 case 't':
4521 char newtab = optarg[0];
4522 if (! newtab)
4523 die (SORT_FAILURE, 0, _("empty tab"));
4524 if (optarg[1])
4526 if (STREQ (optarg, "\\0"))
4527 newtab = '\0';
4528 else
4530 /* Provoke with 'sort -txx'. Complain about
4531 "multi-character tab" instead of "multibyte tab", so
4532 that the diagnostic's wording does not need to be
4533 changed once multibyte characters are supported. */
4534 die (SORT_FAILURE, 0, _("multi-character tab %s"),
4535 quote (optarg));
4538 if (tab != TAB_DEFAULT && tab != newtab)
4539 die (SORT_FAILURE, 0, _("incompatible tabs"));
4540 tab = newtab;
4542 break;
4544 case 'T':
4545 add_temp_dir (optarg);
4546 break;
4548 case PARALLEL_OPTION:
4549 nthreads = specify_nthreads (oi, c, optarg);
4550 break;
4552 case 'u':
4553 unique = true;
4554 break;
4556 case 'y':
4557 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4558 through Solaris 7. It is also accepted by many non-Solaris
4559 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4560 -y is marked as obsolete starting with Solaris 8 (1999), but is
4561 still accepted as of Solaris 10 prerelease (2004).
4563 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4564 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4565 and which in general ignores the argument after "-y" if it
4566 consists entirely of digits (it can even be empty). */
4567 if (optarg == argv[optind - 1])
4569 char const *p;
4570 for (p = optarg; ISDIGIT (*p); p++)
4571 continue;
4572 optind -= (*p != '\0');
4574 break;
4576 case 'z':
4577 eolchar = 0;
4578 break;
4580 case_GETOPT_HELP_CHAR;
4582 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4584 default:
4585 usage (SORT_FAILURE);
4589 if (files_from)
4591 /* When using --files0-from=F, you may not specify any files
4592 on the command-line. */
4593 if (nfiles)
4595 error (0, 0, _("extra operand %s"), quoteaf (files[0]));
4596 fprintf (stderr, "%s\n",
4597 _("file operands cannot be combined with --files0-from"));
4598 usage (SORT_FAILURE);
4601 FILE *stream = xfopen (files_from, "r");
4603 readtokens0_init (&tok);
4605 if (! readtokens0 (stream, &tok))
4606 die (SORT_FAILURE, 0, _("cannot read file names from %s"),
4607 quoteaf (files_from));
4608 xfclose (stream, files_from);
4610 if (tok.n_tok)
4612 free (files);
4613 files = tok.tok;
4614 nfiles = tok.n_tok;
4615 for (size_t i = 0; i < nfiles; i++)
4617 if (STREQ (files[i], "-"))
4618 die (SORT_FAILURE, 0, _("when reading file names from stdin, "
4619 "no file name of %s allowed"),
4620 quoteaf (files[i]));
4621 else if (files[i][0] == '\0')
4623 /* Using the standard 'filename:line-number:' prefix here is
4624 not totally appropriate, since NUL is the separator,
4625 not NL, but it might be better than nothing. */
4626 unsigned long int file_number = i + 1;
4627 die (SORT_FAILURE, 0,
4628 _("%s:%lu: invalid zero-length file name"),
4629 quotef (files_from), file_number);
4633 else
4634 die (SORT_FAILURE, 0, _("no input from %s"),
4635 quoteaf (files_from));
4638 /* Inheritance of global options to individual keys. */
4639 for (key = keylist; key; key = key->next)
4641 if (default_key_compare (key) && !key->reverse)
4643 key->ignore = gkey.ignore;
4644 key->translate = gkey.translate;
4645 key->skipsblanks = gkey.skipsblanks;
4646 key->skipeblanks = gkey.skipeblanks;
4647 key->month = gkey.month;
4648 key->numeric = gkey.numeric;
4649 key->general_numeric = gkey.general_numeric;
4650 key->human_numeric = gkey.human_numeric;
4651 key->version = gkey.version;
4652 key->random = gkey.random;
4653 key->reverse = gkey.reverse;
4656 need_random |= key->random;
4659 if (!keylist && !default_key_compare (&gkey))
4661 gkey_only = true;
4662 insertkey (&gkey);
4663 need_random |= gkey.random;
4666 check_ordering_compatibility ();
4668 if (debug)
4670 if (checkonly || outfile)
4672 static char opts[] = "X --debug";
4673 opts[0] = (checkonly ? checkonly : 'o');
4674 incompatible_options (opts);
4677 /* Always output the locale in debug mode, since this
4678 is such a common source of confusion. */
4680 /* OpenBSD can only set some categories with LC_ALL above,
4681 so set LC_COLLATE explicitly to flag errors. */
4682 if (locale_ok)
4683 locale_ok = !! setlocale (LC_COLLATE, "");
4684 if (! locale_ok)
4685 error (0, 0, "%s", _("failed to set locale"));
4686 if (hard_LC_COLLATE)
4687 error (0, 0, _("using %s sorting rules"),
4688 quote (setlocale (LC_COLLATE, NULL)));
4689 else
4690 error (0, 0, "%s", _("using simple byte comparison"));
4692 key_warnings (&gkey, gkey_only);
4695 reverse = gkey.reverse;
4697 if (need_random)
4698 random_md5_state_init (random_source);
4700 if (temp_dir_count == 0)
4702 char const *tmp_dir = getenv ("TMPDIR");
4703 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4706 if (nfiles == 0)
4708 nfiles = 1;
4709 free (files);
4710 files = xmalloc (sizeof *files);
4711 *files = (char *) "-";
4714 /* Need to re-check that we meet the minimum requirement for memory
4715 usage with the final value for NMERGE. */
4716 if (0 < sort_size)
4717 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4719 if (checkonly)
4721 if (nfiles > 1)
4722 die (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4723 quoteaf (files[1]), checkonly);
4725 if (outfile)
4727 static char opts[] = {0, 'o', 0};
4728 opts[0] = checkonly;
4729 incompatible_options (opts);
4732 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4733 input is not properly sorted. */
4734 return check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER;
4737 /* Check all inputs are accessible, or exit immediately. */
4738 check_inputs (files, nfiles);
4740 /* Check output is writable, or exit immediately. */
4741 check_output (outfile);
4743 if (mergeonly)
4745 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4747 for (size_t i = 0; i < nfiles; ++i)
4748 sortfiles[i].name = files[i];
4750 merge (sortfiles, 0, nfiles, outfile);
4751 IF_LINT (free (sortfiles));
4753 else
4755 if (!nthreads)
4757 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4758 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4761 /* Avoid integer overflow later. */
4762 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4763 nthreads = MIN (nthreads, nthreads_max);
4765 sort (files, nfiles, outfile, nthreads);
4768 #ifdef lint
4769 if (files_from)
4770 readtokens0_free (&tok);
4771 else
4772 free (files);
4773 #endif
4775 if (have_read_stdin && fclose (stdin) == EOF)
4776 sort_die (_("close failed"), "-");
4778 return EXIT_SUCCESS;