doc: improve the description of sort --random-sort
[coreutils.git] / src / sort.c
blobb1948f20bc00b73e9d0058fce64b2df44b3f9393
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2015 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <pthread.h>
27 #include <sys/resource.h>
28 #include <sys/types.h>
29 #include <sys/wait.h>
30 #include <signal.h>
31 #include <assert.h>
32 #include "system.h"
33 #include "argmatch.h"
34 #include "error.h"
35 #include "fadvise.h"
36 #include "filevercmp.h"
37 #include "hard-locale.h"
38 #include "hash.h"
39 #include "heap.h"
40 #include "ignore-value.h"
41 #include "md5.h"
42 #include "mbswidth.h"
43 #include "nproc.h"
44 #include "physmem.h"
45 #include "posixver.h"
46 #include "quote.h"
47 #include "randread.h"
48 #include "readtokens0.h"
49 #include "stdio--.h"
50 #include "stdlib--.h"
51 #include "strnumcmp.h"
52 #include "xmemcoll.h"
53 #include "xnanosleep.h"
54 #include "xstrtol.h"
56 #ifndef RLIMIT_DATA
57 struct rlimit { size_t rlim_cur; };
58 # define getrlimit(Resource, Rlp) (-1)
59 #endif
61 /* The official name of this program (e.g., no 'g' prefix). */
62 #define PROGRAM_NAME "sort"
64 #define AUTHORS \
65 proper_name ("Mike Haertel"), \
66 proper_name ("Paul Eggert")
68 #if HAVE_LANGINFO_CODESET
69 # include <langinfo.h>
70 #endif
72 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
73 present. */
74 #ifndef SA_NOCLDSTOP
75 # define SA_NOCLDSTOP 0
76 /* No sigprocmask. Always 'return' zero. */
77 # define sigprocmask(How, Set, Oset) (0)
78 # define sigset_t int
79 # if ! HAVE_SIGINTERRUPT
80 # define siginterrupt(sig, flag) /* empty */
81 # endif
82 #endif
84 #if !defined OPEN_MAX && defined NR_OPEN
85 # define OPEN_MAX NR_OPEN
86 #endif
87 #if !defined OPEN_MAX
88 # define OPEN_MAX 20
89 #endif
91 #define UCHAR_LIM (UCHAR_MAX + 1)
93 #if HAVE_C99_STRTOLD
94 # define long_double long double
95 #else
96 # define long_double double
97 # undef strtold
98 # define strtold strtod
99 #endif
101 #ifndef DEFAULT_TMPDIR
102 # define DEFAULT_TMPDIR "/tmp"
103 #endif
105 /* Maximum number of lines to merge every time a NODE is taken from
106 the merge queue. Node is at LEVEL in the binary merge tree,
107 and is responsible for merging TOTAL lines. */
108 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
110 /* Heuristic value for the number of lines for which it is worth creating
111 a subthread, during an internal merge sort. I.e., it is a small number
112 of "average" lines for which sorting via two threads is faster than
113 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
114 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
115 lines of gensort -a output is sorted slightly faster with --parallel=2
116 than with --parallel=1. By contrast, using --parallel=1 is about 10%
117 faster than using --parallel=2 with a 64K-line input. */
118 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
119 verify (4 <= SUBTHREAD_LINES_HEURISTIC);
121 /* The number of threads after which there are
122 diminishing performance gains. */
123 enum { DEFAULT_MAX_THREADS = 8 };
125 /* Exit statuses. */
126 enum
128 /* POSIX says to exit with status 1 if invoked with -c and the
129 input is not properly sorted. */
130 SORT_OUT_OF_ORDER = 1,
132 /* POSIX says any other irregular exit must exit with a status
133 code greater than 1. */
134 SORT_FAILURE = 2
137 enum
139 /* The number of times we should try to fork a compression process
140 (we retry if the fork call fails). We don't _need_ to compress
141 temp files, this is just to reduce disk access, so this number
142 can be small. Each retry doubles in duration. */
143 MAX_FORK_TRIES_COMPRESS = 4,
145 /* The number of times we should try to fork a decompression process.
146 If we can't fork a decompression process, we can't sort, so this
147 number should be big. Each retry doubles in duration. */
148 MAX_FORK_TRIES_DECOMPRESS = 9
151 enum
153 /* Level of the end-of-merge node, one level above the root. */
154 MERGE_END = 0,
156 /* Level of the root node in merge tree. */
157 MERGE_ROOT = 1
160 /* The representation of the decimal point in the current locale. */
161 static int decimal_point;
163 /* Thousands separator; if -1, then there isn't one. */
164 static int thousands_sep;
166 /* Nonzero if the corresponding locales are hard. */
167 static bool hard_LC_COLLATE;
168 #if HAVE_NL_LANGINFO
169 static bool hard_LC_TIME;
170 #endif
172 #define NONZERO(x) ((x) != 0)
174 /* The kind of blanks for '-b' to skip in various options. */
175 enum blanktype { bl_start, bl_end, bl_both };
177 /* The character marking end of line. Default to \n. */
178 static char eolchar = '\n';
180 /* Lines are held in core as counted strings. */
181 struct line
183 char *text; /* Text of the line. */
184 size_t length; /* Length including final newline. */
185 char *keybeg; /* Start of first key. */
186 char *keylim; /* Limit of first key. */
189 /* Input buffers. */
190 struct buffer
192 char *buf; /* Dynamically allocated buffer,
193 partitioned into 3 regions:
194 - input data;
195 - unused area;
196 - an array of lines, in reverse order. */
197 size_t used; /* Number of bytes used for input data. */
198 size_t nlines; /* Number of lines in the line array. */
199 size_t alloc; /* Number of bytes allocated. */
200 size_t left; /* Number of bytes left from previous reads. */
201 size_t line_bytes; /* Number of bytes to reserve for each line. */
202 bool eof; /* An EOF has been read. */
205 /* Sort key. */
206 struct keyfield
208 size_t sword; /* Zero-origin 'word' to start at. */
209 size_t schar; /* Additional characters to skip. */
210 size_t eword; /* Zero-origin last 'word' of key. */
211 size_t echar; /* Additional characters in field. */
212 bool const *ignore; /* Boolean array of characters to ignore. */
213 char const *translate; /* Translation applied to characters. */
214 bool skipsblanks; /* Skip leading blanks when finding start. */
215 bool skipeblanks; /* Skip leading blanks when finding end. */
216 bool numeric; /* Flag for numeric comparison. Handle
217 strings of digits with optional decimal
218 point, but no exponential notation. */
219 bool random; /* Sort by random hash of key. */
220 bool general_numeric; /* Flag for general, numeric comparison.
221 Handle numbers in exponential notation. */
222 bool human_numeric; /* Flag for sorting by human readable
223 units with either SI or IEC prefixes. */
224 bool month; /* Flag for comparison by month name. */
225 bool reverse; /* Reverse the sense of comparison. */
226 bool version; /* sort by version number */
227 bool obsolete_used; /* obsolescent key option format is used. */
228 struct keyfield *next; /* Next keyfield to try. */
231 struct month
233 char const *name;
234 int val;
237 /* Binary merge tree node. */
238 struct merge_node
240 struct line *lo; /* Lines to merge from LO child node. */
241 struct line *hi; /* Lines to merge from HI child ndoe. */
242 struct line *end_lo; /* End of available lines from LO. */
243 struct line *end_hi; /* End of available lines from HI. */
244 struct line **dest; /* Pointer to destination of merge. */
245 size_t nlo; /* Total Lines remaining from LO. */
246 size_t nhi; /* Total lines remaining from HI. */
247 struct merge_node *parent; /* Parent node. */
248 struct merge_node *lo_child; /* LO child node. */
249 struct merge_node *hi_child; /* HI child node. */
250 unsigned int level; /* Level in merge tree. */
251 bool queued; /* Node is already in heap. */
252 pthread_mutex_t lock; /* Lock for node operations. */
255 /* Priority queue of merge nodes. */
256 struct merge_node_queue
258 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
259 pthread_mutex_t mutex; /* Lock for queue operations. */
260 pthread_cond_t cond; /* Conditional wait for empty queue to populate
261 when popping. */
264 /* Used to implement --unique (-u). */
265 static struct line saved_line;
267 /* FIXME: None of these tables work with multibyte character sets.
268 Also, there are many other bugs when handling multibyte characters.
269 One way to fix this is to rewrite 'sort' to use wide characters
270 internally, but doing this with good performance is a bit
271 tricky. */
273 /* Table of blanks. */
274 static bool blanks[UCHAR_LIM];
276 /* Table of non-printing characters. */
277 static bool nonprinting[UCHAR_LIM];
279 /* Table of non-dictionary characters (not letters, digits, or blanks). */
280 static bool nondictionary[UCHAR_LIM];
282 /* Translation table folding lower case to upper. */
283 static char fold_toupper[UCHAR_LIM];
285 #define MONTHS_PER_YEAR 12
287 /* Table mapping month names to integers.
288 Alphabetic order allows binary search. */
289 static struct month monthtab[] =
291 {"APR", 4},
292 {"AUG", 8},
293 {"DEC", 12},
294 {"FEB", 2},
295 {"JAN", 1},
296 {"JUL", 7},
297 {"JUN", 6},
298 {"MAR", 3},
299 {"MAY", 5},
300 {"NOV", 11},
301 {"OCT", 10},
302 {"SEP", 9}
305 /* During the merge phase, the number of files to merge at once. */
306 #define NMERGE_DEFAULT 16
308 /* Minimum size for a merge or check buffer. */
309 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
311 /* Minimum sort size; the code might not work with smaller sizes. */
312 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
314 /* The number of bytes needed for a merge or check buffer, which can
315 function relatively efficiently even if it holds only one line. If
316 a longer line is seen, this value is increased. */
317 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
319 /* The approximate maximum number of bytes of main memory to use, as
320 specified by the user. Zero if the user has not specified a size. */
321 static size_t sort_size;
323 /* The initial allocation factor for non-regular files.
324 This is used, e.g., when reading from a pipe.
325 Don't make it too big, since it is multiplied by ~130 to
326 obtain the size of the actual buffer sort will allocate.
327 Also, there may be 8 threads all doing this at the same time. */
328 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
330 /* Array of directory names in which any temporary files are to be created. */
331 static char const **temp_dirs;
333 /* Number of temporary directory names used. */
334 static size_t temp_dir_count;
336 /* Number of allocated slots in temp_dirs. */
337 static size_t temp_dir_alloc;
339 /* Flag to reverse the order of all comparisons. */
340 static bool reverse;
342 /* Flag for stable sort. This turns off the last ditch bytewise
343 comparison of lines, and instead leaves lines in the same order
344 they were read if all keys compare equal. */
345 static bool stable;
347 /* If TAB has this value, blanks separate fields. */
348 enum { TAB_DEFAULT = CHAR_MAX + 1 };
350 /* Tab character separating fields. If TAB_DEFAULT, then fields are
351 separated by the empty string between a non-blank character and a blank
352 character. */
353 static int tab = TAB_DEFAULT;
355 /* Flag to remove consecutive duplicate lines from the output.
356 Only the last of a sequence of equal lines will be output. */
357 static bool unique;
359 /* Nonzero if any of the input files are the standard input. */
360 static bool have_read_stdin;
362 /* List of key field comparisons to be tried. */
363 static struct keyfield *keylist;
365 /* Program used to (de)compress temp files. Must accept -d. */
366 static char const *compress_program;
368 /* Annotate the output with extra info to aid the user. */
369 static bool debug;
371 /* Maximum number of files to merge in one go. If more than this
372 number are present, temp files will be used. */
373 static unsigned int nmerge = NMERGE_DEFAULT;
375 /* Output an error to stderr using async-signal-safe routines, and _exit().
376 This can be used safely from signal handlers,
377 and between fork() and exec() of multithreaded processes. */
379 static void async_safe_die (int, const char *) ATTRIBUTE_NORETURN;
380 static void
381 async_safe_die (int errnum, const char *errstr)
383 ignore_value (write (STDERR_FILENO, errstr, strlen (errstr)));
385 /* Even if defined HAVE_STRERROR_R, we can't use it,
386 as it may return a translated string etc. and even if not
387 may malloc() which is unsafe. We might improve this
388 by testing for sys_errlist and using that if available.
389 For now just report the error number. */
390 if (errnum)
392 char errbuf[INT_BUFSIZE_BOUND (errnum)];
393 char *p = inttostr (errnum, errbuf);
394 ignore_value (write (STDERR_FILENO, ": errno ", 8));
395 ignore_value (write (STDERR_FILENO, p, strlen (p)));
398 ignore_value (write (STDERR_FILENO, "\n", 1));
400 _exit (SORT_FAILURE);
403 /* Report MESSAGE for FILE, then clean up and exit.
404 If FILE is null, it represents standard output. */
406 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
407 static void
408 die (char const *message, char const *file)
410 error (0, errno, "%s: %s", message,
411 quotef (file ? file : _("standard output")));
412 exit (SORT_FAILURE);
415 void
416 usage (int status)
418 if (status != EXIT_SUCCESS)
419 emit_try_help ();
420 else
422 printf (_("\
423 Usage: %s [OPTION]... [FILE]...\n\
424 or: %s [OPTION]... --files0-from=F\n\
426 program_name, program_name);
427 fputs (_("\
428 Write sorted concatenation of all FILE(s) to standard output.\n\
429 "), stdout);
431 emit_stdin_note ();
432 emit_mandatory_arg_note ();
434 fputs (_("\
435 Ordering options:\n\
437 "), stdout);
438 fputs (_("\
439 -b, --ignore-leading-blanks ignore leading blanks\n\
440 -d, --dictionary-order consider only blanks and alphanumeric characters\
442 -f, --ignore-case fold lower case to upper case characters\n\
443 "), stdout);
444 fputs (_("\
445 -g, --general-numeric-sort compare according to general numerical value\n\
446 -i, --ignore-nonprinting consider only printable characters\n\
447 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
448 "), stdout);
449 fputs (_("\
450 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
451 "), stdout);
452 fputs (_("\
453 -n, --numeric-sort compare according to string numerical value\n\
454 -R, --random-sort shuffle, but group identical keys. See shuf(1)\n\
455 --random-source=FILE get random bytes from FILE\n\
456 -r, --reverse reverse the result of comparisons\n\
457 "), stdout);
458 fputs (_("\
459 --sort=WORD sort according to WORD:\n\
460 general-numeric -g, human-numeric -h, month -M,\
462 numeric -n, random -R, version -V\n\
463 -V, --version-sort natural sort of (version) numbers within text\n\
465 "), stdout);
466 fputs (_("\
467 Other options:\n\
469 "), stdout);
470 fputs (_("\
471 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
472 for more use temp files\n\
473 "), stdout);
474 fputs (_("\
475 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
476 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
478 --compress-program=PROG compress temporaries with PROG;\n\
479 decompress them with PROG -d\n\
480 "), stdout);
481 fputs (_("\
482 --debug annotate the part of the line used to sort,\n\
483 and warn about questionable usage to stderr\n\
484 --files0-from=F read input from the files specified by\n\
485 NUL-terminated names in file F;\n\
486 If F is - then read names from standard input\n\
487 "), stdout);
488 fputs (_("\
489 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
490 -m, --merge merge already sorted files; do not sort\n\
491 "), stdout);
492 fputs (_("\
493 -o, --output=FILE write result to FILE instead of standard output\n\
494 -s, --stable stabilize sort by disabling last-resort comparison\
496 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
497 "), stdout);
498 printf (_("\
499 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
500 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
501 multiple options specify multiple directories\n\
502 --parallel=N change the number of sorts run concurrently to N\n\
503 -u, --unique with -c, check for strict ordering;\n\
504 without -c, output only the first of an equal run\
506 "), DEFAULT_TMPDIR);
507 fputs (_("\
508 -z, --zero-terminated line delimiter is NUL, not newline\n\
509 "), stdout);
510 fputs (HELP_OPTION_DESCRIPTION, stdout);
511 fputs (VERSION_OPTION_DESCRIPTION, stdout);
512 fputs (_("\
514 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
515 field number and C a character position in the field; both are origin 1, and\n\
516 the stop position defaults to the line's end. If neither -t nor -b is in\n\
517 effect, characters in a field are counted from the beginning of the preceding\n\
518 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
520 which override global ordering options for that key. If no key is given, use\n\
521 the entire line as the key.\n\
523 SIZE may be followed by the following multiplicative suffixes:\n\
524 "), stdout);
525 fputs (_("\
526 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
528 *** WARNING ***\n\
529 The locale specified by the environment affects sort order.\n\
530 Set LC_ALL=C to get the traditional sort order that uses\n\
531 native byte values.\n\
532 "), stdout );
533 emit_ancillary_info (PROGRAM_NAME);
536 exit (status);
539 /* For long options that have no equivalent short option, use a
540 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
541 enum
543 CHECK_OPTION = CHAR_MAX + 1,
544 COMPRESS_PROGRAM_OPTION,
545 DEBUG_PROGRAM_OPTION,
546 FILES0_FROM_OPTION,
547 NMERGE_OPTION,
548 RANDOM_SOURCE_OPTION,
549 SORT_OPTION,
550 PARALLEL_OPTION
553 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
555 static struct option const long_options[] =
557 {"ignore-leading-blanks", no_argument, NULL, 'b'},
558 {"check", optional_argument, NULL, CHECK_OPTION},
559 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
560 {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
561 {"dictionary-order", no_argument, NULL, 'd'},
562 {"ignore-case", no_argument, NULL, 'f'},
563 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
564 {"general-numeric-sort", no_argument, NULL, 'g'},
565 {"ignore-nonprinting", no_argument, NULL, 'i'},
566 {"key", required_argument, NULL, 'k'},
567 {"merge", no_argument, NULL, 'm'},
568 {"month-sort", no_argument, NULL, 'M'},
569 {"numeric-sort", no_argument, NULL, 'n'},
570 {"human-numeric-sort", no_argument, NULL, 'h'},
571 {"version-sort", no_argument, NULL, 'V'},
572 {"random-sort", no_argument, NULL, 'R'},
573 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
574 {"sort", required_argument, NULL, SORT_OPTION},
575 {"output", required_argument, NULL, 'o'},
576 {"reverse", no_argument, NULL, 'r'},
577 {"stable", no_argument, NULL, 's'},
578 {"batch-size", required_argument, NULL, NMERGE_OPTION},
579 {"buffer-size", required_argument, NULL, 'S'},
580 {"field-separator", required_argument, NULL, 't'},
581 {"temporary-directory", required_argument, NULL, 'T'},
582 {"unique", no_argument, NULL, 'u'},
583 {"zero-terminated", no_argument, NULL, 'z'},
584 {"parallel", required_argument, NULL, PARALLEL_OPTION},
585 {GETOPT_HELP_OPTION_DECL},
586 {GETOPT_VERSION_OPTION_DECL},
587 {NULL, 0, NULL, 0},
590 #define CHECK_TABLE \
591 _ct_("quiet", 'C') \
592 _ct_("silent", 'C') \
593 _ct_("diagnose-first", 'c')
595 static char const *const check_args[] =
597 #define _ct_(_s, _c) _s,
598 CHECK_TABLE NULL
599 #undef _ct_
601 static char const check_types[] =
603 #define _ct_(_s, _c) _c,
604 CHECK_TABLE
605 #undef _ct_
608 #define SORT_TABLE \
609 _st_("general-numeric", 'g') \
610 _st_("human-numeric", 'h') \
611 _st_("month", 'M') \
612 _st_("numeric", 'n') \
613 _st_("random", 'R') \
614 _st_("version", 'V')
616 static char const *const sort_args[] =
618 #define _st_(_s, _c) _s,
619 SORT_TABLE NULL
620 #undef _st_
622 static char const sort_types[] =
624 #define _st_(_s, _c) _c,
625 SORT_TABLE
626 #undef _st_
629 /* The set of signals that are caught. */
630 static sigset_t caught_signals;
632 /* Critical section status. */
633 struct cs_status
635 bool valid;
636 sigset_t sigs;
639 /* Enter a critical section. */
640 static struct cs_status
641 cs_enter (void)
643 struct cs_status status;
644 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
645 return status;
648 /* Leave a critical section. */
649 static void
650 cs_leave (struct cs_status status)
652 if (status.valid)
654 /* Ignore failure when restoring the signal mask. */
655 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
659 /* Possible states for a temp file. If compressed, the file's status
660 is unreaped or reaped, depending on whether 'sort' has waited for
661 the subprocess to finish. */
662 enum { UNCOMPRESSED, UNREAPED, REAPED };
664 /* The list of temporary files. */
665 struct tempnode
667 struct tempnode *volatile next;
668 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
669 char state;
670 char name[1]; /* Actual size is 1 + file name length. */
672 static struct tempnode *volatile temphead;
673 static struct tempnode *volatile *temptail = &temphead;
675 /* A file to be sorted. */
676 struct sortfile
678 /* The file's name. */
679 char const *name;
681 /* Nonnull if this is a temporary file, in which case NAME == TEMP->name. */
682 struct tempnode *temp;
685 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
686 static Hash_table *proctab;
688 enum { INIT_PROCTAB_SIZE = 47 };
690 static size_t
691 proctab_hasher (void const *entry, size_t tabsize)
693 struct tempnode const *node = entry;
694 return node->pid % tabsize;
697 static bool
698 proctab_comparator (void const *e1, void const *e2)
700 struct tempnode const *n1 = e1;
701 struct tempnode const *n2 = e2;
702 return n1->pid == n2->pid;
705 /* The number of unreaped child processes. */
706 static pid_t nprocs;
708 static bool delete_proc (pid_t);
710 /* If PID is positive, wait for the child process with that PID to
711 exit, and assume that PID has already been removed from the process
712 table. If PID is 0 or -1, clean up some child that has exited (by
713 waiting for it, and removing it from the proc table) and return the
714 child's process ID. However, if PID is 0 and no children have
715 exited, return 0 without waiting. */
717 static pid_t
718 reap (pid_t pid)
720 int status;
721 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
723 if (cpid < 0)
724 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
725 quoteaf (compress_program));
726 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
728 if (! WIFEXITED (status) || WEXITSTATUS (status))
729 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
730 quoteaf (compress_program));
731 --nprocs;
734 return cpid;
737 /* TEMP represents a new process; add it to the process table. Create
738 the process table the first time it's called. */
740 static void
741 register_proc (struct tempnode *temp)
743 if (! proctab)
745 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
746 proctab_hasher,
747 proctab_comparator,
748 NULL);
749 if (! proctab)
750 xalloc_die ();
753 temp->state = UNREAPED;
755 if (! hash_insert (proctab, temp))
756 xalloc_die ();
759 /* If PID is in the process table, remove it and return true.
760 Otherwise, return false. */
762 static bool
763 delete_proc (pid_t pid)
765 struct tempnode test;
767 test.pid = pid;
768 struct tempnode *node = hash_delete (proctab, &test);
769 if (! node)
770 return false;
771 node->state = REAPED;
772 return true;
775 /* Remove PID from the process table, and wait for it to exit if it
776 hasn't already. */
778 static void
779 wait_proc (pid_t pid)
781 if (delete_proc (pid))
782 reap (pid);
785 /* Reap any exited children. Do not block; reap only those that have
786 already exited. */
788 static void
789 reap_exited (void)
791 while (0 < nprocs && reap (0))
792 continue;
795 /* Reap at least one exited child, waiting if necessary. */
797 static void
798 reap_some (void)
800 reap (-1);
801 reap_exited ();
804 /* Reap all children, waiting if necessary. */
806 static void
807 reap_all (void)
809 while (0 < nprocs)
810 reap (-1);
813 /* Clean up any remaining temporary files. */
815 static void
816 cleanup (void)
818 struct tempnode const *node;
820 for (node = temphead; node; node = node->next)
821 unlink (node->name);
822 temphead = NULL;
825 /* Cleanup actions to take when exiting. */
827 static void
828 exit_cleanup (void)
830 if (temphead)
832 /* Clean up any remaining temporary files in a critical section so
833 that a signal handler does not try to clean them too. */
834 struct cs_status cs = cs_enter ();
835 cleanup ();
836 cs_leave (cs);
839 close_stdout ();
842 /* Create a new temporary file, returning its newly allocated tempnode.
843 Store into *PFD the file descriptor open for writing.
844 If the creation fails, return NULL and store -1 into *PFD if the
845 failure is due to file descriptor exhaustion and
846 SURVIVE_FD_EXHAUSTION; otherwise, die. */
848 static struct tempnode *
849 create_temp_file (int *pfd, bool survive_fd_exhaustion)
851 static char const slashbase[] = "/sortXXXXXX";
852 static size_t temp_dir_index;
853 int fd;
854 int saved_errno;
855 char const *temp_dir = temp_dirs[temp_dir_index];
856 size_t len = strlen (temp_dir);
857 struct tempnode *node =
858 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
859 char *file = node->name;
860 struct cs_status cs;
862 memcpy (file, temp_dir, len);
863 memcpy (file + len, slashbase, sizeof slashbase);
864 node->next = NULL;
865 if (++temp_dir_index == temp_dir_count)
866 temp_dir_index = 0;
868 /* Create the temporary file in a critical section, to avoid races. */
869 cs = cs_enter ();
870 fd = mkstemp (file);
871 if (0 <= fd)
873 *temptail = node;
874 temptail = &node->next;
876 saved_errno = errno;
877 cs_leave (cs);
878 errno = saved_errno;
880 if (fd < 0)
882 if (! (survive_fd_exhaustion && errno == EMFILE))
883 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
884 quoteaf (temp_dir));
885 free (node);
886 node = NULL;
889 *pfd = fd;
890 return node;
893 /* Return a stream for FILE, opened with mode HOW. A null FILE means
894 standard output; HOW should be "w". When opening for input, "-"
895 means standard input. To avoid confusion, do not return file
896 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
897 opening an ordinary FILE. Return NULL if unsuccessful.
899 fadvise() is used to specify an access pattern for input files.
900 There are a few hints we could possibly provide,
901 and after careful testing it was decided that
902 specifying POSIX_FADV_SEQUENTIAL was not detrimental
903 to any cases. On Linux 2.6.31, this option doubles
904 the size of read ahead performed and thus was seen to
905 benefit these cases:
906 Merging
907 Sorting with a smaller internal buffer
908 Reading from faster flash devices
910 In _addition_ one could also specify other hints...
912 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
913 at least uses that to _synchronously_ prepopulate the cache
914 with the specified range. While sort does need to
915 read all of its input before outputting, a synchronous
916 read of the whole file up front precludes any processing
917 that sort could do in parallel with the system doing
918 read ahead of the data. This was seen to have negative effects
919 in a couple of cases:
920 Merging
921 Sorting with a smaller internal buffer
922 Note this option was seen to shorten the runtime for sort
923 on a multicore system with lots of RAM and other processes
924 competing for CPU. It could be argued that more explicit
925 scheduling hints with 'nice' et. al. are more appropriate
926 for this situation.
928 POSIX_FADV_NOREUSE is a possibility as it could lower
929 the priority of input data in the cache as sort will
930 only need to process it once. However its functionality
931 has changed over Linux kernel versions and as of 2.6.31
932 it does nothing and thus we can't depend on what it might
933 do in future.
935 POSIX_FADV_DONTNEED is not appropriate for user specified
936 input files, but for temp files we do want to drop the
937 cache immediately after processing. This is done implicitly
938 however when the files are unlinked. */
940 static FILE *
941 stream_open (char const *file, char const *how)
943 FILE *fp;
945 if (*how == 'r')
947 if (STREQ (file, "-"))
949 have_read_stdin = true;
950 fp = stdin;
952 else
953 fp = fopen (file, how);
954 fadvise (fp, FADVISE_SEQUENTIAL);
956 else if (*how == 'w')
958 if (file && ftruncate (STDOUT_FILENO, 0) != 0)
959 error (SORT_FAILURE, errno, _("%s: error truncating"),
960 quotef (file));
961 fp = stdout;
963 else
964 assert (!"unexpected mode passed to stream_open");
966 return fp;
969 /* Same as stream_open, except always return a non-null value; die on
970 failure. */
972 static FILE *
973 xfopen (char const *file, char const *how)
975 FILE *fp = stream_open (file, how);
976 if (!fp)
977 die (_("open failed"), file);
978 return fp;
981 /* Close FP, whose name is FILE, and report any errors. */
983 static void
984 xfclose (FILE *fp, char const *file)
986 switch (fileno (fp))
988 case STDIN_FILENO:
989 /* Allow reading stdin from tty more than once. */
990 if (feof (fp))
991 clearerr (fp);
992 break;
994 case STDOUT_FILENO:
995 /* Don't close stdout just yet. close_stdout does that. */
996 if (fflush (fp) != 0)
997 die (_("fflush failed"), file);
998 break;
1000 default:
1001 if (fclose (fp) != 0)
1002 die (_("close failed"), file);
1003 break;
1007 static void
1008 move_fd_or_die (int oldfd, int newfd)
1010 if (oldfd != newfd)
1012 /* This should never fail for our usage. */
1013 dup2 (oldfd, newfd);
1014 close (oldfd);
1018 /* Fork a child process for piping to and do common cleanup. The
1019 TRIES parameter tells us how many times to try to fork before
1020 giving up. Return the PID of the child, or -1 (setting errno)
1021 on failure. */
1023 static pid_t
1024 pipe_fork (int pipefds[2], size_t tries)
1026 #if HAVE_WORKING_FORK
1027 struct tempnode *saved_temphead;
1028 int saved_errno;
1029 double wait_retry = 0.25;
1030 pid_t pid IF_LINT ( = -1);
1031 struct cs_status cs;
1033 if (pipe (pipefds) < 0)
1034 return -1;
1036 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1037 uncontrolled subprocess generation can hurt performance significantly.
1038 Allow at most NMERGE + 2 subprocesses, on the theory that there
1039 may be some useful parallelism by letting compression for the
1040 previous merge finish (1 subprocess) in parallel with the current
1041 merge (NMERGE + 1 subprocesses). */
1043 if (nmerge + 1 < nprocs)
1044 reap_some ();
1046 while (tries--)
1048 /* This is so the child process won't delete our temp files
1049 if it receives a signal before exec-ing. */
1050 cs = cs_enter ();
1051 saved_temphead = temphead;
1052 temphead = NULL;
1054 pid = fork ();
1055 saved_errno = errno;
1056 if (pid)
1057 temphead = saved_temphead;
1059 cs_leave (cs);
1060 errno = saved_errno;
1062 if (0 <= pid || errno != EAGAIN)
1063 break;
1064 else
1066 xnanosleep (wait_retry);
1067 wait_retry *= 2;
1068 reap_exited ();
1072 if (pid < 0)
1074 saved_errno = errno;
1075 close (pipefds[0]);
1076 close (pipefds[1]);
1077 errno = saved_errno;
1079 else if (pid == 0)
1081 close (STDIN_FILENO);
1082 close (STDOUT_FILENO);
1084 else
1085 ++nprocs;
1087 return pid;
1089 #else /* ! HAVE_WORKING_FORK */
1090 return -1;
1091 #endif
1094 /* Create a temporary file and, if asked for, start a compressor
1095 to that file. Set *PFP to the file handle and return
1096 the address of the new temp node. If the creation
1097 fails, return NULL if the failure is due to file descriptor
1098 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1100 static struct tempnode *
1101 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1103 int tempfd;
1104 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1105 if (! node)
1106 return NULL;
1108 node->state = UNCOMPRESSED;
1110 if (compress_program)
1112 int pipefds[2];
1114 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1115 if (0 < node->pid)
1117 close (tempfd);
1118 close (pipefds[0]);
1119 tempfd = pipefds[1];
1121 register_proc (node);
1123 else if (node->pid == 0)
1125 /* Being the child of a multithreaded program before exec(),
1126 we're restricted to calling async-signal-safe routines here. */
1127 close (pipefds[1]);
1128 move_fd_or_die (tempfd, STDOUT_FILENO);
1129 move_fd_or_die (pipefds[0], STDIN_FILENO);
1131 execlp (compress_program, compress_program, (char *) NULL);
1133 async_safe_die (errno, "couldn't execute compress program");
1137 *pfp = fdopen (tempfd, "w");
1138 if (! *pfp)
1139 die (_("couldn't create temporary file"), node->name);
1141 return node;
1144 /* Create a temporary file and, if asked for, start a compressor
1145 to that file. Set *PFP to the file handle and return the address
1146 of the new temp node. Die on failure. */
1148 static struct tempnode *
1149 create_temp (FILE **pfp)
1151 return maybe_create_temp (pfp, false);
1154 /* Open a compressed temp file and start a decompression process through
1155 which to filter the input. Return NULL (setting errno to
1156 EMFILE) if we ran out of file descriptors, and die on any other
1157 kind of failure. */
1159 static FILE *
1160 open_temp (struct tempnode *temp)
1162 int tempfd, pipefds[2];
1163 FILE *fp = NULL;
1165 if (temp->state == UNREAPED)
1166 wait_proc (temp->pid);
1168 tempfd = open (temp->name, O_RDONLY);
1169 if (tempfd < 0)
1170 return NULL;
1172 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1174 switch (child)
1176 case -1:
1177 if (errno != EMFILE)
1178 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1179 quoteaf (compress_program));
1180 close (tempfd);
1181 errno = EMFILE;
1182 break;
1184 case 0:
1185 /* Being the child of a multithreaded program before exec(),
1186 we're restricted to calling async-signal-safe routines here. */
1187 close (pipefds[0]);
1188 move_fd_or_die (tempfd, STDIN_FILENO);
1189 move_fd_or_die (pipefds[1], STDOUT_FILENO);
1191 execlp (compress_program, compress_program, "-d", (char *) NULL);
1193 async_safe_die (errno, "couldn't execute compress program (with -d)");
1195 default:
1196 temp->pid = child;
1197 register_proc (temp);
1198 close (tempfd);
1199 close (pipefds[1]);
1201 fp = fdopen (pipefds[0], "r");
1202 if (! fp)
1204 int saved_errno = errno;
1205 close (pipefds[0]);
1206 errno = saved_errno;
1208 break;
1211 return fp;
1214 /* Append DIR to the array of temporary directory names. */
1215 static void
1216 add_temp_dir (char const *dir)
1218 if (temp_dir_count == temp_dir_alloc)
1219 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1221 temp_dirs[temp_dir_count++] = dir;
1224 /* Remove NAME from the list of temporary files. */
1226 static void
1227 zaptemp (char const *name)
1229 struct tempnode *volatile *pnode;
1230 struct tempnode *node;
1231 struct tempnode *next;
1232 int unlink_status;
1233 int unlink_errno = 0;
1234 struct cs_status cs;
1236 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1237 continue;
1239 if (node->state == UNREAPED)
1240 wait_proc (node->pid);
1242 /* Unlink the temporary file in a critical section to avoid races. */
1243 next = node->next;
1244 cs = cs_enter ();
1245 unlink_status = unlink (name);
1246 unlink_errno = errno;
1247 *pnode = next;
1248 cs_leave (cs);
1250 if (unlink_status != 0)
1251 error (0, unlink_errno, _("warning: cannot remove: %s"), quotef (name));
1252 if (! next)
1253 temptail = pnode;
1254 free (node);
1257 #if HAVE_NL_LANGINFO
1259 static int
1260 struct_month_cmp (void const *m1, void const *m2)
1262 struct month const *month1 = m1;
1263 struct month const *month2 = m2;
1264 return strcmp (month1->name, month2->name);
1267 #endif
1269 /* Initialize the character class tables. */
1271 static void
1272 inittables (void)
1274 size_t i;
1276 for (i = 0; i < UCHAR_LIM; ++i)
1278 blanks[i] = !! isblank (i);
1279 nonprinting[i] = ! isprint (i);
1280 nondictionary[i] = ! isalnum (i) && ! isblank (i);
1281 fold_toupper[i] = toupper (i);
1284 #if HAVE_NL_LANGINFO
1285 /* If we're not in the "C" locale, read different names for months. */
1286 if (hard_LC_TIME)
1288 for (i = 0; i < MONTHS_PER_YEAR; i++)
1290 char const *s;
1291 size_t s_len;
1292 size_t j, k;
1293 char *name;
1295 s = nl_langinfo (ABMON_1 + i);
1296 s_len = strlen (s);
1297 monthtab[i].name = name = xmalloc (s_len + 1);
1298 monthtab[i].val = i + 1;
1300 for (j = k = 0; j < s_len; j++)
1301 if (! isblank (to_uchar (s[j])))
1302 name[k++] = fold_toupper[to_uchar (s[j])];
1303 name[k] = '\0';
1305 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1307 #endif
1310 /* Specify how many inputs may be merged at once.
1311 This may be set on the command-line with the
1312 --batch-size option. */
1313 static void
1314 specify_nmerge (int oi, char c, char const *s)
1316 uintmax_t n;
1317 struct rlimit rlimit;
1318 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1320 /* Try to find out how many file descriptors we'll be able
1321 to open. We need at least nmerge + 3 (STDIN_FILENO,
1322 STDOUT_FILENO and STDERR_FILENO). */
1323 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1324 ? rlimit.rlim_cur
1325 : OPEN_MAX)
1326 - 3);
1328 if (e == LONGINT_OK)
1330 nmerge = n;
1331 if (nmerge != n)
1332 e = LONGINT_OVERFLOW;
1333 else
1335 if (nmerge < 2)
1337 error (0, 0, _("invalid --%s argument %s"),
1338 long_options[oi].name, quote (s));
1339 error (SORT_FAILURE, 0,
1340 _("minimum --%s argument is %s"),
1341 long_options[oi].name, quote ("2"));
1343 else if (max_nmerge < nmerge)
1345 e = LONGINT_OVERFLOW;
1347 else
1348 return;
1352 if (e == LONGINT_OVERFLOW)
1354 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1355 error (0, 0, _("--%s argument %s too large"),
1356 long_options[oi].name, quote (s));
1357 error (SORT_FAILURE, 0,
1358 _("maximum --%s argument with current rlimit is %s"),
1359 long_options[oi].name,
1360 uinttostr (max_nmerge, max_nmerge_buf));
1362 else
1363 xstrtol_fatal (e, oi, c, long_options, s);
1366 /* Specify the amount of main memory to use when sorting. */
1367 static void
1368 specify_sort_size (int oi, char c, char const *s)
1370 uintmax_t n;
1371 char *suffix;
1372 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1374 /* The default unit is KiB. */
1375 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1377 if (n <= UINTMAX_MAX / 1024)
1378 n *= 1024;
1379 else
1380 e = LONGINT_OVERFLOW;
1383 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1384 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1385 switch (suffix[0])
1387 case 'b':
1388 e = LONGINT_OK;
1389 break;
1391 case '%':
1393 double mem = physmem_total () * n / 100;
1395 /* Use "<", not "<=", to avoid problems with rounding. */
1396 if (mem < UINTMAX_MAX)
1398 n = mem;
1399 e = LONGINT_OK;
1401 else
1402 e = LONGINT_OVERFLOW;
1404 break;
1407 if (e == LONGINT_OK)
1409 /* If multiple sort sizes are specified, take the maximum, so
1410 that option order does not matter. */
1411 if (n < sort_size)
1412 return;
1414 sort_size = n;
1415 if (sort_size == n)
1417 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1418 return;
1421 e = LONGINT_OVERFLOW;
1424 xstrtol_fatal (e, oi, c, long_options, s);
1427 /* Specify the number of threads to spawn during internal sort. */
1428 static size_t
1429 specify_nthreads (int oi, char c, char const *s)
1431 unsigned long int nthreads;
1432 enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1433 if (e == LONGINT_OVERFLOW)
1434 return SIZE_MAX;
1435 if (e != LONGINT_OK)
1436 xstrtol_fatal (e, oi, c, long_options, s);
1437 if (SIZE_MAX < nthreads)
1438 nthreads = SIZE_MAX;
1439 if (nthreads == 0)
1440 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1441 return nthreads;
1444 /* Return the default sort size. */
1445 static size_t
1446 default_sort_size (void)
1448 /* Let SIZE be MEM, but no more than the maximum object size,
1449 total memory, or system resource limits. Don't bother to check
1450 for values like RLIM_INFINITY since in practice they are not much
1451 less than SIZE_MAX. */
1452 size_t size = SIZE_MAX;
1453 struct rlimit rlimit;
1454 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1455 size = rlimit.rlim_cur;
1456 #ifdef RLIMIT_AS
1457 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1458 size = rlimit.rlim_cur;
1459 #endif
1461 /* Leave a large safety margin for the above limits, as failure can
1462 occur when they are exceeded. */
1463 size /= 2;
1465 #ifdef RLIMIT_RSS
1466 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1467 Exceeding RSS is not fatal, but can be quite slow. */
1468 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1469 size = rlimit.rlim_cur / 16 * 15;
1470 #endif
1472 /* Let MEM be available memory or 1/8 of total memory, whichever
1473 is greater. */
1474 double avail = physmem_available ();
1475 double total = physmem_total ();
1476 double mem = MAX (avail, total / 8);
1478 /* Leave a 1/4 margin for physical memory. */
1479 if (total * 0.75 < size)
1480 size = total * 0.75;
1482 /* Return the minimum of MEM and SIZE, but no less than
1483 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1484 right when only one argument is floating point. */
1485 if (mem < size)
1486 size = mem;
1487 return MAX (size, MIN_SORT_SIZE);
1490 /* Return the sort buffer size to use with the input files identified
1491 by FPS and FILES, which are alternate names of the same files.
1492 NFILES gives the number of input files; NFPS may be less. Assume
1493 that each input line requires LINE_BYTES extra bytes' worth of line
1494 information. Do not exceed the size bound specified by the user
1495 (or a default size bound, if the user does not specify one). */
1497 static size_t
1498 sort_buffer_size (FILE *const *fps, size_t nfps,
1499 char *const *files, size_t nfiles,
1500 size_t line_bytes)
1502 /* A bound on the input size. If zero, the bound hasn't been
1503 determined yet. */
1504 static size_t size_bound;
1506 /* In the worst case, each input byte is a newline. */
1507 size_t worst_case_per_input_byte = line_bytes + 1;
1509 /* Keep enough room for one extra input line and an extra byte.
1510 This extra room might be needed when preparing to read EOF. */
1511 size_t size = worst_case_per_input_byte + 1;
1513 size_t i;
1515 for (i = 0; i < nfiles; i++)
1517 struct stat st;
1518 off_t file_size;
1519 size_t worst_case;
1521 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1522 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1523 : stat (files[i], &st))
1524 != 0)
1525 die (_("stat failed"), files[i]);
1527 if (S_ISREG (st.st_mode))
1528 file_size = st.st_size;
1529 else
1531 /* The file has unknown size. If the user specified a sort
1532 buffer size, use that; otherwise, guess the size. */
1533 if (sort_size)
1534 return sort_size;
1535 file_size = INPUT_FILE_SIZE_GUESS;
1538 if (! size_bound)
1540 size_bound = sort_size;
1541 if (! size_bound)
1542 size_bound = default_sort_size ();
1545 /* Add the amount of memory needed to represent the worst case
1546 where the input consists entirely of newlines followed by a
1547 single non-newline. Check for overflow. */
1548 worst_case = file_size * worst_case_per_input_byte + 1;
1549 if (file_size != worst_case / worst_case_per_input_byte
1550 || size_bound - size <= worst_case)
1551 return size_bound;
1552 size += worst_case;
1555 return size;
1558 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1559 must be at least sizeof (struct line). Allocate ALLOC bytes
1560 initially. */
1562 static void
1563 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1565 /* Ensure that the line array is properly aligned. If the desired
1566 size cannot be allocated, repeatedly halve it until allocation
1567 succeeds. The smaller allocation may hurt overall performance,
1568 but that's better than failing. */
1569 while (true)
1571 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1572 buf->buf = malloc (alloc);
1573 if (buf->buf)
1574 break;
1575 alloc /= 2;
1576 if (alloc <= line_bytes + 1)
1577 xalloc_die ();
1580 buf->line_bytes = line_bytes;
1581 buf->alloc = alloc;
1582 buf->used = buf->left = buf->nlines = 0;
1583 buf->eof = false;
1586 /* Return one past the limit of the line array. */
1588 static inline struct line *
1589 buffer_linelim (struct buffer const *buf)
1591 void *linelim = buf->buf + buf->alloc;
1592 return linelim;
1595 /* Return a pointer to the first character of the field specified
1596 by KEY in LINE. */
1598 static char *
1599 begfield (struct line const *line, struct keyfield const *key)
1601 char *ptr = line->text, *lim = ptr + line->length - 1;
1602 size_t sword = key->sword;
1603 size_t schar = key->schar;
1605 /* The leading field separator itself is included in a field when -t
1606 is absent. */
1608 if (tab != TAB_DEFAULT)
1609 while (ptr < lim && sword--)
1611 while (ptr < lim && *ptr != tab)
1612 ++ptr;
1613 if (ptr < lim)
1614 ++ptr;
1616 else
1617 while (ptr < lim && sword--)
1619 while (ptr < lim && blanks[to_uchar (*ptr)])
1620 ++ptr;
1621 while (ptr < lim && !blanks[to_uchar (*ptr)])
1622 ++ptr;
1625 /* If we're ignoring leading blanks when computing the Start
1626 of the field, skip past them here. */
1627 if (key->skipsblanks)
1628 while (ptr < lim && blanks[to_uchar (*ptr)])
1629 ++ptr;
1631 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1632 ptr = MIN (lim, ptr + schar);
1634 return ptr;
1637 /* Return the limit of (a pointer to the first character after) the field
1638 in LINE specified by KEY. */
1640 static char *
1641 limfield (struct line const *line, struct keyfield const *key)
1643 char *ptr = line->text, *lim = ptr + line->length - 1;
1644 size_t eword = key->eword, echar = key->echar;
1646 if (echar == 0)
1647 eword++; /* Skip all of end field. */
1649 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1650 whichever comes first. If there are more than EWORD fields, leave
1651 PTR pointing at the beginning of the field having zero-based index,
1652 EWORD. If a delimiter character was specified (via -t), then that
1653 'beginning' is the first character following the delimiting TAB.
1654 Otherwise, leave PTR pointing at the first 'blank' character after
1655 the preceding field. */
1656 if (tab != TAB_DEFAULT)
1657 while (ptr < lim && eword--)
1659 while (ptr < lim && *ptr != tab)
1660 ++ptr;
1661 if (ptr < lim && (eword || echar))
1662 ++ptr;
1664 else
1665 while (ptr < lim && eword--)
1667 while (ptr < lim && blanks[to_uchar (*ptr)])
1668 ++ptr;
1669 while (ptr < lim && !blanks[to_uchar (*ptr)])
1670 ++ptr;
1673 #ifdef POSIX_UNSPECIFIED
1674 /* The following block of code makes GNU sort incompatible with
1675 standard Unix sort, so it's ifdef'd out for now.
1676 The POSIX spec isn't clear on how to interpret this.
1677 FIXME: request clarification.
1679 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1680 Date: Thu, 30 May 96 12:20:41 -0400
1681 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1683 [...]I believe I've found another bug in 'sort'.
1685 $ cat /tmp/sort.in
1686 a b c 2 d
1687 pq rs 1 t
1688 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1689 a b c 2 d
1690 pq rs 1 t
1691 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1692 pq rs 1 t
1693 a b c 2 d
1695 Unix sort produced the answer I expected: sort on the single character
1696 in column 7. GNU sort produced different results, because it disagrees
1697 on the interpretation of the key-end spec "M.N". Unix sort reads this
1698 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1699 "skip M-1 fields, then either N-1 characters or the rest of the current
1700 field, whichever comes first". This extra clause applies only to
1701 key-ends, not key-starts.
1704 /* Make LIM point to the end of (one byte past) the current field. */
1705 if (tab != TAB_DEFAULT)
1707 char *newlim;
1708 newlim = memchr (ptr, tab, lim - ptr);
1709 if (newlim)
1710 lim = newlim;
1712 else
1714 char *newlim;
1715 newlim = ptr;
1716 while (newlim < lim && blanks[to_uchar (*newlim)])
1717 ++newlim;
1718 while (newlim < lim && !blanks[to_uchar (*newlim)])
1719 ++newlim;
1720 lim = newlim;
1722 #endif
1724 if (echar != 0) /* We need to skip over a portion of the end field. */
1726 /* If we're ignoring leading blanks when computing the End
1727 of the field, skip past them here. */
1728 if (key->skipeblanks)
1729 while (ptr < lim && blanks[to_uchar (*ptr)])
1730 ++ptr;
1732 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1733 ptr = MIN (lim, ptr + echar);
1736 return ptr;
1739 /* Fill BUF reading from FP, moving buf->left bytes from the end
1740 of buf->buf to the beginning first. If EOF is reached and the
1741 file wasn't terminated by a newline, supply one. Set up BUF's line
1742 table too. FILE is the name of the file corresponding to FP.
1743 Return true if some input was read. */
1745 static bool
1746 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1748 struct keyfield const *key = keylist;
1749 char eol = eolchar;
1750 size_t line_bytes = buf->line_bytes;
1751 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1753 if (buf->eof)
1754 return false;
1756 if (buf->used != buf->left)
1758 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1759 buf->used = buf->left;
1760 buf->nlines = 0;
1763 while (true)
1765 char *ptr = buf->buf + buf->used;
1766 struct line *linelim = buffer_linelim (buf);
1767 struct line *line = linelim - buf->nlines;
1768 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1769 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1771 while (line_bytes + 1 < avail)
1773 /* Read as many bytes as possible, but do not read so many
1774 bytes that there might not be enough room for the
1775 corresponding line array. The worst case is when the
1776 rest of the input file consists entirely of newlines,
1777 except that the last byte is not a newline. */
1778 size_t readsize = (avail - 1) / (line_bytes + 1);
1779 size_t bytes_read = fread (ptr, 1, readsize, fp);
1780 char *ptrlim = ptr + bytes_read;
1781 char *p;
1782 avail -= bytes_read;
1784 if (bytes_read != readsize)
1786 if (ferror (fp))
1787 die (_("read failed"), file);
1788 if (feof (fp))
1790 buf->eof = true;
1791 if (buf->buf == ptrlim)
1792 return false;
1793 if (line_start != ptrlim && ptrlim[-1] != eol)
1794 *ptrlim++ = eol;
1798 /* Find and record each line in the just-read input. */
1799 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1801 /* Delimit the line with NUL. This eliminates the need to
1802 temporarily replace the last byte with NUL when calling
1803 xmemcoll(), which increases performance. */
1804 *p = '\0';
1805 ptr = p + 1;
1806 line--;
1807 line->text = line_start;
1808 line->length = ptr - line_start;
1809 mergesize = MAX (mergesize, line->length);
1810 avail -= line_bytes;
1812 if (key)
1814 /* Precompute the position of the first key for
1815 efficiency. */
1816 line->keylim = (key->eword == SIZE_MAX
1818 : limfield (line, key));
1820 if (key->sword != SIZE_MAX)
1821 line->keybeg = begfield (line, key);
1822 else
1824 if (key->skipsblanks)
1825 while (blanks[to_uchar (*line_start)])
1826 line_start++;
1827 line->keybeg = line_start;
1831 line_start = ptr;
1834 ptr = ptrlim;
1835 if (buf->eof)
1836 break;
1839 buf->used = ptr - buf->buf;
1840 buf->nlines = buffer_linelim (buf) - line;
1841 if (buf->nlines != 0)
1843 buf->left = ptr - line_start;
1844 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1845 return true;
1849 /* The current input line is too long to fit in the buffer.
1850 Increase the buffer size and try again, keeping it properly
1851 aligned. */
1852 size_t line_alloc = buf->alloc / sizeof (struct line);
1853 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1854 buf->alloc = line_alloc * sizeof (struct line);
1859 /* Table that maps characters to order-of-magnitude values. */
1860 static char const unit_order[UCHAR_LIM] =
1862 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1863 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1864 /* This initializer syntax works on all C99 hosts. For now, use
1865 it only on non-ASCII hosts, to ease the pain of porting to
1866 pre-C99 ASCII hosts. */
1867 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1868 ['k']=1,
1869 #else
1870 /* Generate the following table with this command:
1871 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1872 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1873 |fmt */
1874 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1875 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1876 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1877 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1878 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1879 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1880 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1881 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1882 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1883 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1884 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1885 #endif
1888 /* Return an integer that represents the order of magnitude of the
1889 unit following the number. The number may contain thousands
1890 separators and a decimal point, but it may not contain leading blanks.
1891 Negative numbers get negative orders; zero numbers have a zero order. */
1893 static int _GL_ATTRIBUTE_PURE
1894 find_unit_order (char const *number)
1896 bool minus_sign = (*number == '-');
1897 char const *p = number + minus_sign;
1898 int nonzero = 0;
1899 unsigned char ch;
1901 /* Scan to end of number.
1902 Decimals or separators not followed by digits stop the scan.
1903 Numbers ending in decimals or separators are thus considered
1904 to be lacking in units.
1905 FIXME: add support for multibyte thousands_sep and decimal_point. */
1909 while (ISDIGIT (ch = *p++))
1910 nonzero |= ch - '0';
1912 while (ch == thousands_sep);
1914 if (ch == decimal_point)
1915 while (ISDIGIT (ch = *p++))
1916 nonzero |= ch - '0';
1918 if (nonzero)
1920 int order = unit_order[ch];
1921 return (minus_sign ? -order : order);
1923 else
1924 return 0;
1927 /* Compare numbers A and B ending in units with SI or IEC prefixes
1928 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1930 static int
1931 human_numcompare (char const *a, char const *b)
1933 while (blanks[to_uchar (*a)])
1934 a++;
1935 while (blanks[to_uchar (*b)])
1936 b++;
1938 int diff = find_unit_order (a) - find_unit_order (b);
1939 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1942 /* Compare strings A and B as numbers without explicitly converting them to
1943 machine numbers. Comparatively slow for short strings, but asymptotically
1944 hideously fast. */
1946 static int
1947 numcompare (char const *a, char const *b)
1949 while (blanks[to_uchar (*a)])
1950 a++;
1951 while (blanks[to_uchar (*b)])
1952 b++;
1954 return strnumcmp (a, b, decimal_point, thousands_sep);
1957 /* Work around a problem whereby the long double value returned by glibc's
1958 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
1959 A and B before calling strtold. FIXME: remove this function once
1960 gnulib guarantees that strtold's result is always well defined. */
1961 static int
1962 nan_compare (char const *sa, char const *sb)
1964 long_double a;
1965 memset (&a, 0, sizeof a);
1966 a = strtold (sa, NULL);
1968 long_double b;
1969 memset (&b, 0, sizeof b);
1970 b = strtold (sb, NULL);
1972 return memcmp (&a, &b, sizeof a);
1975 static int
1976 general_numcompare (char const *sa, char const *sb)
1978 /* FIXME: maybe add option to try expensive FP conversion
1979 only if A and B can't be compared more cheaply/accurately. */
1981 char *ea;
1982 char *eb;
1983 long_double a = strtold (sa, &ea);
1984 long_double b = strtold (sb, &eb);
1986 /* Put conversion errors at the start of the collating sequence. */
1987 if (sa == ea)
1988 return sb == eb ? 0 : -1;
1989 if (sb == eb)
1990 return 1;
1992 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1993 conversion errors but before numbers; sort them by internal
1994 bit-pattern, for lack of a more portable alternative. */
1995 return (a < b ? -1
1996 : a > b ? 1
1997 : a == b ? 0
1998 : b == b ? -1
1999 : a == a ? 1
2000 : nan_compare (sa, sb));
2003 /* Return an integer in 1..12 of the month name MONTH.
2004 Return 0 if the name in S is not recognized. */
2006 static int
2007 getmonth (char const *month, char **ea)
2009 size_t lo = 0;
2010 size_t hi = MONTHS_PER_YEAR;
2012 while (blanks[to_uchar (*month)])
2013 month++;
2017 size_t ix = (lo + hi) / 2;
2018 char const *m = month;
2019 char const *n = monthtab[ix].name;
2021 for (;; m++, n++)
2023 if (!*n)
2025 if (ea)
2026 *ea = (char *) m;
2027 return monthtab[ix].val;
2029 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
2031 hi = ix;
2032 break;
2034 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
2036 lo = ix + 1;
2037 break;
2041 while (lo < hi);
2043 return 0;
2046 /* A randomly chosen MD5 state, used for random comparison. */
2047 static struct md5_ctx random_md5_state;
2049 /* Initialize the randomly chosen MD5 state. */
2051 static void
2052 random_md5_state_init (char const *random_source)
2054 unsigned char buf[MD5_DIGEST_SIZE];
2055 struct randread_source *r = randread_new (random_source, sizeof buf);
2056 if (! r)
2057 die (_("open failed"), random_source);
2058 randread (r, buf, sizeof buf);
2059 if (randread_free (r) != 0)
2060 die (_("close failed"), random_source);
2061 md5_init_ctx (&random_md5_state);
2062 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2065 /* This is like strxfrm, except it reports any error and exits. */
2067 static size_t
2068 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2070 errno = 0;
2071 size_t translated_size = strxfrm (dest, src, destsize);
2073 if (errno)
2075 error (0, errno, _("string transformation failed"));
2076 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2077 error (SORT_FAILURE, 0,
2078 _("the untransformed string was %s"),
2079 quotearg_n_style (0, locale_quoting_style, src));
2082 return translated_size;
2085 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2086 using one or more random hash functions. TEXTA[LENA] and
2087 TEXTB[LENB] must be zero. */
2089 static int
2090 compare_random (char *restrict texta, size_t lena,
2091 char *restrict textb, size_t lenb)
2093 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2094 data. This is used to break ties if there is a checksum
2095 collision, and this is good enough given the astronomically low
2096 probability of a collision. */
2097 int xfrm_diff = 0;
2099 char stackbuf[4000];
2100 char *buf = stackbuf;
2101 size_t bufsize = sizeof stackbuf;
2102 void *allocated = NULL;
2103 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2104 struct md5_ctx s[2];
2105 s[0] = s[1] = random_md5_state;
2107 if (hard_LC_COLLATE)
2109 char const *lima = texta + lena;
2110 char const *limb = textb + lenb;
2112 while (true)
2114 /* Transform the text into the basis of comparison, so that byte
2115 strings that would otherwise considered to be equal are
2116 considered equal here even if their bytes differ.
2118 Each time through this loop, transform one
2119 null-terminated string's worth from TEXTA or from TEXTB
2120 or both. That way, there's no need to store the
2121 transformation of the whole line, if it contains many
2122 null-terminated strings. */
2124 /* Store the transformed data into a big-enough buffer. */
2126 /* A 3X size guess avoids the overhead of calling strxfrm
2127 twice on typical implementations. Don't worry about
2128 size_t overflow, as the guess need not be correct. */
2129 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2130 if (bufsize < guess_bufsize)
2132 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2133 free (allocated);
2134 buf = allocated = malloc (bufsize);
2135 if (! buf)
2137 buf = stackbuf;
2138 bufsize = sizeof stackbuf;
2142 size_t sizea =
2143 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2144 bool a_fits = sizea <= bufsize;
2145 size_t sizeb =
2146 (textb < limb
2147 ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
2148 (a_fits ? bufsize - sizea : 0))
2149 + 1)
2150 : 0);
2152 if (! (a_fits && sizea + sizeb <= bufsize))
2154 bufsize = sizea + sizeb;
2155 if (bufsize < SIZE_MAX / 3)
2156 bufsize = bufsize * 3 / 2;
2157 free (allocated);
2158 buf = allocated = xmalloc (bufsize);
2159 if (texta < lima)
2160 strxfrm (buf, texta, sizea);
2161 if (textb < limb)
2162 strxfrm (buf + sizea, textb, sizeb);
2165 /* Advance past NULs to the next part of each input string,
2166 exiting the loop if both strings are exhausted. When
2167 exiting the loop, prepare to finish off the tiebreaker
2168 comparison properly. */
2169 if (texta < lima)
2170 texta += strlen (texta) + 1;
2171 if (textb < limb)
2172 textb += strlen (textb) + 1;
2173 if (! (texta < lima || textb < limb))
2175 lena = sizea; texta = buf;
2176 lenb = sizeb; textb = buf + sizea;
2177 break;
2180 /* Accumulate the transformed data in the corresponding
2181 checksums. */
2182 md5_process_bytes (buf, sizea, &s[0]);
2183 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2185 /* Update the tiebreaker comparison of the transformed data. */
2186 if (! xfrm_diff)
2188 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2189 if (! xfrm_diff)
2190 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2195 /* Compute and compare the checksums. */
2196 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2197 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2198 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2200 /* Fall back on the tiebreaker if the checksums collide. */
2201 if (! diff)
2203 if (! xfrm_diff)
2205 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2206 if (! xfrm_diff)
2207 xfrm_diff = (lena > lenb) - (lena < lenb);
2210 diff = xfrm_diff;
2213 free (allocated);
2215 return diff;
2218 /* Return the printable width of the block of memory starting at
2219 TEXT and ending just before LIM, counting each tab as one byte.
2220 FIXME: Should we generally be counting non printable chars? */
2222 static size_t
2223 debug_width (char const *text, char const *lim)
2225 size_t width = mbsnwidth (text, lim - text, 0);
2226 while (text < lim)
2227 width += (*text++ == '\t');
2228 return width;
2231 /* For debug mode, "underline" a key at the
2232 specified offset and screen width. */
2234 static void
2235 mark_key (size_t offset, size_t width)
2237 while (offset--)
2238 putchar (' ');
2240 if (!width)
2241 printf (_("^ no match for key\n"));
2242 else
2245 putchar ('_');
2246 while (--width);
2248 putchar ('\n');
2252 /* Return true if KEY is a numeric key. */
2254 static inline bool
2255 key_numeric (struct keyfield const *key)
2257 return key->numeric || key->general_numeric || key->human_numeric;
2260 /* For LINE, output a debugging line that underlines KEY in LINE.
2261 If KEY is null, underline the whole line. */
2263 static void
2264 debug_key (struct line const *line, struct keyfield const *key)
2266 char *text = line->text;
2267 char *beg = text;
2268 char *lim = text + line->length - 1;
2270 if (key)
2272 if (key->sword != SIZE_MAX)
2273 beg = begfield (line, key);
2274 if (key->eword != SIZE_MAX)
2275 lim = limfield (line, key);
2277 if (key->skipsblanks || key->month || key_numeric (key))
2279 char saved = *lim;
2280 *lim = '\0';
2282 while (blanks[to_uchar (*beg)])
2283 beg++;
2285 char *tighter_lim = beg;
2287 if (lim < beg)
2288 tighter_lim = lim;
2289 else if (key->month)
2290 getmonth (beg, &tighter_lim);
2291 else if (key->general_numeric)
2292 ignore_value (strtold (beg, &tighter_lim));
2293 else if (key->numeric || key->human_numeric)
2295 char *p = beg + (beg < lim && *beg == '-');
2296 bool found_digit = false;
2297 unsigned char ch;
2301 while (ISDIGIT (ch = *p++))
2302 found_digit = true;
2304 while (ch == thousands_sep);
2306 if (ch == decimal_point)
2307 while (ISDIGIT (ch = *p++))
2308 found_digit = true;
2310 if (found_digit)
2311 tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
2313 else
2314 tighter_lim = lim;
2316 *lim = saved;
2317 lim = tighter_lim;
2321 size_t offset = debug_width (text, beg);
2322 size_t width = debug_width (beg, lim);
2323 mark_key (offset, width);
2326 /* Debug LINE by underlining its keys. */
2328 static void
2329 debug_line (struct line const *line)
2331 struct keyfield const *key = keylist;
2334 debug_key (line, key);
2335 while (key && ((key = key->next) || ! (unique || stable)));
2338 /* Return whether sorting options specified for key. */
2340 static bool
2341 default_key_compare (struct keyfield const *key)
2343 return ! (key->ignore
2344 || key->translate
2345 || key->skipsblanks
2346 || key->skipeblanks
2347 || key_numeric (key)
2348 || key->month
2349 || key->version
2350 || key->random
2351 /* || key->reverse */
2355 /* Convert a key to the short options used to specify it. */
2357 static void
2358 key_to_opts (struct keyfield const *key, char *opts)
2360 if (key->skipsblanks || key->skipeblanks)
2361 *opts++ = 'b';/* either disables global -b */
2362 if (key->ignore == nondictionary)
2363 *opts++ = 'd';
2364 if (key->translate)
2365 *opts++ = 'f';
2366 if (key->general_numeric)
2367 *opts++ = 'g';
2368 if (key->human_numeric)
2369 *opts++ = 'h';
2370 if (key->ignore == nonprinting)
2371 *opts++ = 'i';
2372 if (key->month)
2373 *opts++ = 'M';
2374 if (key->numeric)
2375 *opts++ = 'n';
2376 if (key->random)
2377 *opts++ = 'R';
2378 if (key->reverse)
2379 *opts++ = 'r';
2380 if (key->version)
2381 *opts++ = 'V';
2382 *opts = '\0';
2385 /* Output data independent key warnings to stderr. */
2387 static void
2388 key_warnings (struct keyfield const *gkey, bool gkey_only)
2390 struct keyfield const *key;
2391 struct keyfield ugkey = *gkey;
2392 unsigned long keynum = 1;
2394 for (key = keylist; key; key = key->next, keynum++)
2396 if (key->obsolete_used)
2398 size_t sword = key->sword;
2399 size_t eword = key->eword;
2400 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2401 /* obsolescent syntax +A.x -B.y is equivalent to:
2402 -k A+1.x+1,B.y (when y = 0)
2403 -k A+1.x+1,B+1.y (when y > 0) */
2404 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2405 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2406 char *po = obuf;
2407 char *pn = nbuf;
2409 if (sword == SIZE_MAX)
2410 sword++;
2412 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2413 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2414 if (key->eword != SIZE_MAX)
2416 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2417 stpcpy (stpcpy (pn, ","),
2418 umaxtostr (eword + 1
2419 + (key->echar == SIZE_MAX), tmp));
2421 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2422 quote_n (0, obuf), quote_n (1, nbuf));
2425 /* Warn about field specs that will never match. */
2426 if (key->sword != SIZE_MAX && key->eword < key->sword)
2427 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2429 /* Warn about significant leading blanks. */
2430 bool implicit_skip = key_numeric (key) || key->month;
2431 bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
2432 && !(key->schar || key->echar);
2433 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2434 if (!gkey_only && tab == TAB_DEFAULT && !line_offset
2435 && ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
2436 || (!key->skipsblanks && key->schar)
2437 || (!key->skipeblanks && key->echar)))
2438 error (0, 0, _("leading blanks are significant in key %lu; "
2439 "consider also specifying 'b'"), keynum);
2441 /* Warn about numeric comparisons spanning fields,
2442 as field delimiters could be interpreted as part
2443 of the number (maybe only in other locales). */
2444 if (!gkey_only && key_numeric (key))
2446 size_t sword = key->sword + 1;
2447 size_t eword = key->eword + 1;
2448 if (!sword)
2449 sword++;
2450 if (!eword || sword < eword)
2451 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2452 keynum);
2455 /* Flag global options not copied or specified in any key. */
2456 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2457 ugkey.ignore = NULL;
2458 if (ugkey.translate && (ugkey.translate == key->translate))
2459 ugkey.translate = NULL;
2460 ugkey.skipsblanks &= !key->skipsblanks;
2461 ugkey.skipeblanks &= !key->skipeblanks;
2462 ugkey.month &= !key->month;
2463 ugkey.numeric &= !key->numeric;
2464 ugkey.general_numeric &= !key->general_numeric;
2465 ugkey.human_numeric &= !key->human_numeric;
2466 ugkey.random &= !key->random;
2467 ugkey.version &= !key->version;
2468 ugkey.reverse &= !key->reverse;
2471 /* Warn about ignored global options flagged above.
2472 Note if gkey is the only one in the list, all flags are cleared. */
2473 if (!default_key_compare (&ugkey)
2474 || (ugkey.reverse && (stable || unique) && keylist))
2476 bool ugkey_reverse = ugkey.reverse;
2477 if (!(stable || unique))
2478 ugkey.reverse = false;
2479 /* The following is too big, but guaranteed to be "big enough". */
2480 char opts[sizeof short_options];
2481 key_to_opts (&ugkey, opts);
2482 error (0, 0,
2483 ngettext ("option '-%s' is ignored",
2484 "options '-%s' are ignored",
2485 select_plural (strlen (opts))), opts);
2486 ugkey.reverse = ugkey_reverse;
2488 if (ugkey.reverse && !(stable || unique) && keylist)
2489 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2492 /* Compare two lines A and B trying every key in sequence until there
2493 are no more keys or a difference is found. */
2495 static int
2496 keycompare (struct line const *a, struct line const *b)
2498 struct keyfield *key = keylist;
2500 /* For the first iteration only, the key positions have been
2501 precomputed for us. */
2502 char *texta = a->keybeg;
2503 char *textb = b->keybeg;
2504 char *lima = a->keylim;
2505 char *limb = b->keylim;
2507 int diff;
2509 while (true)
2511 char const *translate = key->translate;
2512 bool const *ignore = key->ignore;
2514 /* Treat field ends before field starts as empty fields. */
2515 lima = MAX (texta, lima);
2516 limb = MAX (textb, limb);
2518 /* Find the lengths. */
2519 size_t lena = lima - texta;
2520 size_t lenb = limb - textb;
2522 if (hard_LC_COLLATE || key_numeric (key)
2523 || key->month || key->random || key->version)
2525 char *ta;
2526 char *tb;
2527 size_t tlena;
2528 size_t tlenb;
2530 char enda IF_LINT (= 0);
2531 char endb IF_LINT (= 0);
2532 void *allocated IF_LINT (= NULL);
2533 char stackbuf[4000];
2535 if (ignore || translate)
2537 /* Compute with copies of the keys, which are the result of
2538 translating or ignoring characters, and which need their
2539 own storage. */
2541 size_t i;
2543 /* Allocate space for copies. */
2544 size_t size = lena + 1 + lenb + 1;
2545 if (size <= sizeof stackbuf)
2546 ta = stackbuf, allocated = NULL;
2547 else
2548 ta = allocated = xmalloc (size);
2549 tb = ta + lena + 1;
2551 /* Put into each copy a version of the key in which the
2552 requested characters are ignored or translated. */
2553 for (tlena = i = 0; i < lena; i++)
2554 if (! (ignore && ignore[to_uchar (texta[i])]))
2555 ta[tlena++] = (translate
2556 ? translate[to_uchar (texta[i])]
2557 : texta[i]);
2558 ta[tlena] = '\0';
2560 for (tlenb = i = 0; i < lenb; i++)
2561 if (! (ignore && ignore[to_uchar (textb[i])]))
2562 tb[tlenb++] = (translate
2563 ? translate[to_uchar (textb[i])]
2564 : textb[i]);
2565 tb[tlenb] = '\0';
2567 else
2569 /* Use the keys in-place, temporarily null-terminated. */
2570 ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
2571 tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
2574 if (key->numeric)
2575 diff = numcompare (ta, tb);
2576 else if (key->general_numeric)
2577 diff = general_numcompare (ta, tb);
2578 else if (key->human_numeric)
2579 diff = human_numcompare (ta, tb);
2580 else if (key->month)
2581 diff = getmonth (ta, NULL) - getmonth (tb, NULL);
2582 else if (key->random)
2583 diff = compare_random (ta, tlena, tb, tlenb);
2584 else if (key->version)
2585 diff = filevercmp (ta, tb);
2586 else
2588 /* Locale-dependent string sorting. This is slower than
2589 C-locale sorting, which is implemented below. */
2590 if (tlena == 0)
2591 diff = - NONZERO (tlenb);
2592 else if (tlenb == 0)
2593 diff = 1;
2594 else
2595 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2598 if (ignore || translate)
2599 free (allocated);
2600 else
2602 ta[tlena] = enda;
2603 tb[tlenb] = endb;
2606 else if (ignore)
2608 #define CMP_WITH_IGNORE(A, B) \
2609 do \
2611 while (true) \
2613 while (texta < lima && ignore[to_uchar (*texta)]) \
2614 ++texta; \
2615 while (textb < limb && ignore[to_uchar (*textb)]) \
2616 ++textb; \
2617 if (! (texta < lima && textb < limb)) \
2618 break; \
2619 diff = to_uchar (A) - to_uchar (B); \
2620 if (diff) \
2621 goto not_equal; \
2622 ++texta; \
2623 ++textb; \
2626 diff = (texta < lima) - (textb < limb); \
2628 while (0)
2630 if (translate)
2631 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2632 translate[to_uchar (*textb)]);
2633 else
2634 CMP_WITH_IGNORE (*texta, *textb);
2636 else if (lena == 0)
2637 diff = - NONZERO (lenb);
2638 else if (lenb == 0)
2639 goto greater;
2640 else
2642 if (translate)
2644 while (texta < lima && textb < limb)
2646 diff = (to_uchar (translate[to_uchar (*texta++)])
2647 - to_uchar (translate[to_uchar (*textb++)]));
2648 if (diff)
2649 goto not_equal;
2652 else
2654 diff = memcmp (texta, textb, MIN (lena, lenb));
2655 if (diff)
2656 goto not_equal;
2658 diff = lena < lenb ? -1 : lena != lenb;
2661 if (diff)
2662 goto not_equal;
2664 key = key->next;
2665 if (! key)
2666 break;
2668 /* Find the beginning and limit of the next field. */
2669 if (key->eword != SIZE_MAX)
2670 lima = limfield (a, key), limb = limfield (b, key);
2671 else
2672 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2674 if (key->sword != SIZE_MAX)
2675 texta = begfield (a, key), textb = begfield (b, key);
2676 else
2678 texta = a->text, textb = b->text;
2679 if (key->skipsblanks)
2681 while (texta < lima && blanks[to_uchar (*texta)])
2682 ++texta;
2683 while (textb < limb && blanks[to_uchar (*textb)])
2684 ++textb;
2689 return 0;
2691 greater:
2692 diff = 1;
2693 not_equal:
2694 return key->reverse ? -diff : diff;
2697 /* Compare two lines A and B, returning negative, zero, or positive
2698 depending on whether A compares less than, equal to, or greater than B. */
2700 static int
2701 compare (struct line const *a, struct line const *b)
2703 int diff;
2704 size_t alen, blen;
2706 /* First try to compare on the specified keys (if any).
2707 The only two cases with no key at all are unadorned sort,
2708 and unadorned sort -r. */
2709 if (keylist)
2711 diff = keycompare (a, b);
2712 if (diff || unique || stable)
2713 return diff;
2716 /* If the keys all compare equal (or no keys were specified)
2717 fall through to the default comparison. */
2718 alen = a->length - 1, blen = b->length - 1;
2720 if (alen == 0)
2721 diff = - NONZERO (blen);
2722 else if (blen == 0)
2723 diff = 1;
2724 else if (hard_LC_COLLATE)
2726 /* Note xmemcoll0 is a performance enhancement as
2727 it will not unconditionally write '\0' after the
2728 passed in buffers, which was seen to give around
2729 a 3% increase in performance for short lines. */
2730 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2732 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2733 diff = alen < blen ? -1 : alen != blen;
2735 return reverse ? -diff : diff;
2738 /* Write LINE to output stream FP; the output file's name is
2739 OUTPUT_FILE if OUTPUT_FILE is nonnull, and is the standard output
2740 otherwise. If debugging is enabled and FP is standard output,
2741 append some debugging information. */
2743 static void
2744 write_line (struct line const *line, FILE *fp, char const *output_file)
2746 char *buf = line->text;
2747 size_t n_bytes = line->length;
2748 char *ebuf = buf + n_bytes;
2750 if (!output_file && debug)
2752 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2753 char const *c = buf;
2755 while (c < ebuf)
2757 char wc = *c++;
2758 if (wc == '\t')
2759 wc = '>';
2760 else if (c == ebuf)
2761 wc = '\n';
2762 if (fputc (wc, fp) == EOF)
2763 die (_("write failed"), output_file);
2766 debug_line (line);
2768 else
2770 ebuf[-1] = eolchar;
2771 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2772 die (_("write failed"), output_file);
2773 ebuf[-1] = '\0';
2777 /* Check that the lines read from FILE_NAME come in order. Return
2778 true if they are in order. If CHECKONLY == 'c', also print a
2779 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2780 they are not in order. */
2782 static bool
2783 check (char const *file_name, char checkonly)
2785 FILE *fp = xfopen (file_name, "r");
2786 struct buffer buf; /* Input buffer. */
2787 struct line temp; /* Copy of previous line. */
2788 size_t alloc = 0;
2789 uintmax_t line_number = 0;
2790 struct keyfield const *key = keylist;
2791 bool nonunique = ! unique;
2792 bool ordered = true;
2794 initbuf (&buf, sizeof (struct line),
2795 MAX (merge_buffer_size, sort_size));
2796 temp.text = NULL;
2798 while (fillbuf (&buf, fp, file_name))
2800 struct line const *line = buffer_linelim (&buf);
2801 struct line const *linebase = line - buf.nlines;
2803 /* Make sure the line saved from the old buffer contents is
2804 less than or equal to the first line of the new buffer. */
2805 if (alloc && nonunique <= compare (&temp, line - 1))
2807 found_disorder:
2809 if (checkonly == 'c')
2811 struct line const *disorder_line = line - 1;
2812 uintmax_t disorder_line_number =
2813 buffer_linelim (&buf) - disorder_line + line_number;
2814 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2815 fprintf (stderr, _("%s: %s:%s: disorder: "),
2816 program_name, file_name,
2817 umaxtostr (disorder_line_number, hr_buf));
2818 write_line (disorder_line, stderr, _("standard error"));
2821 ordered = false;
2822 break;
2826 /* Compare each line in the buffer with its successor. */
2827 while (linebase < --line)
2828 if (nonunique <= compare (line, line - 1))
2829 goto found_disorder;
2831 line_number += buf.nlines;
2833 /* Save the last line of the buffer. */
2834 if (alloc < line->length)
2838 alloc *= 2;
2839 if (! alloc)
2841 alloc = line->length;
2842 break;
2845 while (alloc < line->length);
2847 free (temp.text);
2848 temp.text = xmalloc (alloc);
2850 memcpy (temp.text, line->text, line->length);
2851 temp.length = line->length;
2852 if (key)
2854 temp.keybeg = temp.text + (line->keybeg - line->text);
2855 temp.keylim = temp.text + (line->keylim - line->text);
2859 xfclose (fp, file_name);
2860 free (buf.buf);
2861 free (temp.text);
2862 return ordered;
2865 /* Open FILES (there are NFILES of them) and store the resulting array
2866 of stream pointers into (*PFPS). Allocate the array. Return the
2867 number of successfully opened files, setting errno if this value is
2868 less than NFILES. */
2870 static size_t
2871 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2873 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2874 int i;
2876 /* Open as many input files as we can. */
2877 for (i = 0; i < nfiles; i++)
2879 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
2880 ? open_temp (files[i].temp)
2881 : stream_open (files[i].name, "r"));
2882 if (!fps[i])
2883 break;
2886 return i;
2889 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2890 files (all of which are at the start of the FILES array), and
2891 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2892 FPS is the vector of open stream corresponding to the files.
2893 Close input and output streams before returning.
2894 OUTPUT_FILE gives the name of the output file. If it is NULL,
2895 the output file is standard output. */
2897 static void
2898 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2899 FILE *ofp, char const *output_file, FILE **fps)
2901 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2902 /* Input buffers for each file. */
2903 struct line saved; /* Saved line storage for unique check. */
2904 struct line const *savedline = NULL;
2905 /* &saved if there is a saved line. */
2906 size_t savealloc = 0; /* Size allocated for the saved line. */
2907 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2908 /* Current line in each line table. */
2909 struct line const **base = xnmalloc (nfiles, sizeof *base);
2910 /* Base of each line table. */
2911 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2912 /* Table representing a permutation of fps,
2913 such that cur[ord[0]] is the smallest line
2914 and will be next output. */
2915 size_t i;
2916 size_t j;
2917 size_t t;
2918 struct keyfield const *key = keylist;
2919 saved.text = NULL;
2921 /* Read initial lines from each input file. */
2922 for (i = 0; i < nfiles; )
2924 initbuf (&buffer[i], sizeof (struct line),
2925 MAX (merge_buffer_size, sort_size / nfiles));
2926 if (fillbuf (&buffer[i], fps[i], files[i].name))
2928 struct line const *linelim = buffer_linelim (&buffer[i]);
2929 cur[i] = linelim - 1;
2930 base[i] = linelim - buffer[i].nlines;
2931 i++;
2933 else
2935 /* fps[i] is empty; eliminate it from future consideration. */
2936 xfclose (fps[i], files[i].name);
2937 if (i < ntemps)
2939 ntemps--;
2940 zaptemp (files[i].name);
2942 free (buffer[i].buf);
2943 --nfiles;
2944 for (j = i; j < nfiles; ++j)
2946 files[j] = files[j + 1];
2947 fps[j] = fps[j + 1];
2952 /* Set up the ord table according to comparisons among input lines.
2953 Since this only reorders two items if one is strictly greater than
2954 the other, it is stable. */
2955 for (i = 0; i < nfiles; ++i)
2956 ord[i] = i;
2957 for (i = 1; i < nfiles; ++i)
2958 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2959 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2961 /* Repeatedly output the smallest line until no input remains. */
2962 while (nfiles)
2964 struct line const *smallest = cur[ord[0]];
2966 /* If uniquified output is turned on, output only the first of
2967 an identical series of lines. */
2968 if (unique)
2970 if (savedline && compare (savedline, smallest))
2972 savedline = NULL;
2973 write_line (&saved, ofp, output_file);
2975 if (!savedline)
2977 savedline = &saved;
2978 if (savealloc < smallest->length)
2981 if (! savealloc)
2983 savealloc = smallest->length;
2984 break;
2986 while ((savealloc *= 2) < smallest->length);
2988 free (saved.text);
2989 saved.text = xmalloc (savealloc);
2991 saved.length = smallest->length;
2992 memcpy (saved.text, smallest->text, saved.length);
2993 if (key)
2995 saved.keybeg =
2996 saved.text + (smallest->keybeg - smallest->text);
2997 saved.keylim =
2998 saved.text + (smallest->keylim - smallest->text);
3002 else
3003 write_line (smallest, ofp, output_file);
3005 /* Check if we need to read more lines into core. */
3006 if (base[ord[0]] < smallest)
3007 cur[ord[0]] = smallest - 1;
3008 else
3010 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
3012 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
3013 cur[ord[0]] = linelim - 1;
3014 base[ord[0]] = linelim - buffer[ord[0]].nlines;
3016 else
3018 /* We reached EOF on fps[ord[0]]. */
3019 for (i = 1; i < nfiles; ++i)
3020 if (ord[i] > ord[0])
3021 --ord[i];
3022 --nfiles;
3023 xfclose (fps[ord[0]], files[ord[0]].name);
3024 if (ord[0] < ntemps)
3026 ntemps--;
3027 zaptemp (files[ord[0]].name);
3029 free (buffer[ord[0]].buf);
3030 for (i = ord[0]; i < nfiles; ++i)
3032 fps[i] = fps[i + 1];
3033 files[i] = files[i + 1];
3034 buffer[i] = buffer[i + 1];
3035 cur[i] = cur[i + 1];
3036 base[i] = base[i + 1];
3038 for (i = 0; i < nfiles; ++i)
3039 ord[i] = ord[i + 1];
3040 continue;
3044 /* The new line just read in may be larger than other lines
3045 already in main memory; push it back in the queue until we
3046 encounter a line larger than it. Optimize for the common
3047 case where the new line is smallest. */
3049 size_t lo = 1;
3050 size_t hi = nfiles;
3051 size_t probe = lo;
3052 size_t ord0 = ord[0];
3053 size_t count_of_smaller_lines;
3055 while (lo < hi)
3057 int cmp = compare (cur[ord0], cur[ord[probe]]);
3058 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3059 hi = probe;
3060 else
3061 lo = probe + 1;
3062 probe = (lo + hi) / 2;
3065 count_of_smaller_lines = lo - 1;
3066 for (j = 0; j < count_of_smaller_lines; j++)
3067 ord[j] = ord[j + 1];
3068 ord[count_of_smaller_lines] = ord0;
3072 if (unique && savedline)
3074 write_line (&saved, ofp, output_file);
3075 free (saved.text);
3078 xfclose (ofp, output_file);
3079 free (fps);
3080 free (buffer);
3081 free (ord);
3082 free (base);
3083 free (cur);
3086 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3087 files (all of which are at the start of the FILES array), and
3088 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3089 Close input and output files before returning.
3090 OUTPUT_FILE gives the name of the output file.
3092 Return the number of files successfully merged. This number can be
3093 less than NFILES if we ran low on file descriptors, but in this
3094 case it is never less than 2. */
3096 static size_t
3097 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3098 FILE *ofp, char const *output_file)
3100 FILE **fps;
3101 size_t nopened = open_input_files (files, nfiles, &fps);
3102 if (nopened < nfiles && nopened < 2)
3103 die (_("open failed"), files[nopened].name);
3104 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3105 return nopened;
3108 /* Merge into T (of size NLINES) the two sorted arrays of lines
3109 LO (with NLINES / 2 members), and
3110 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3111 T and LO point just past their respective arrays, and the arrays
3112 are in reverse order. NLINES must be at least 2. */
3114 static void
3115 mergelines (struct line *restrict t, size_t nlines,
3116 struct line const *restrict lo)
3118 size_t nlo = nlines / 2;
3119 size_t nhi = nlines - nlo;
3120 struct line *hi = t - nlo;
3122 while (true)
3123 if (compare (lo - 1, hi - 1) <= 0)
3125 *--t = *--lo;
3126 if (! --nlo)
3128 /* HI must equal T now, and there is no need to copy from
3129 HI to T. */
3130 return;
3133 else
3135 *--t = *--hi;
3136 if (! --nhi)
3139 *--t = *--lo;
3140 while (--nlo);
3142 return;
3147 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3148 Do this all within one thread. NLINES must be at least 2.
3149 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3150 Otherwise the sort is in-place and TEMP is half-sized.
3151 The input and output arrays are in reverse order, and LINES and
3152 TEMP point just past the end of their respective arrays.
3154 Use a recursive divide-and-conquer algorithm, in the style
3155 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3156 the optimization suggested by exercise 5.2.4-10; this requires room
3157 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3158 writes that this memory optimization was originally published by
3159 D. A. Bell, Comp J. 1 (1958), 75. */
3161 static void
3162 sequential_sort (struct line *restrict lines, size_t nlines,
3163 struct line *restrict temp, bool to_temp)
3165 if (nlines == 2)
3167 /* Declare 'swap' as int, not bool, to work around a bug
3168 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3169 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3170 int swap = (0 < compare (&lines[-1], &lines[-2]));
3171 if (to_temp)
3173 temp[-1] = lines[-1 - swap];
3174 temp[-2] = lines[-2 + swap];
3176 else if (swap)
3178 temp[-1] = lines[-1];
3179 lines[-1] = lines[-2];
3180 lines[-2] = temp[-1];
3183 else
3185 size_t nlo = nlines / 2;
3186 size_t nhi = nlines - nlo;
3187 struct line *lo = lines;
3188 struct line *hi = lines - nlo;
3190 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3191 if (1 < nlo)
3192 sequential_sort (lo, nlo, temp, !to_temp);
3193 else if (!to_temp)
3194 temp[-1] = lo[-1];
3196 struct line *dest;
3197 struct line const *sorted_lo;
3198 if (to_temp)
3200 dest = temp;
3201 sorted_lo = lines;
3203 else
3205 dest = lines;
3206 sorted_lo = temp;
3208 mergelines (dest, nlines, sorted_lo);
3212 static struct merge_node *init_node (struct merge_node *restrict,
3213 struct merge_node *restrict,
3214 struct line *, size_t, size_t, bool);
3217 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3218 lines, with destination DEST. */
3219 static struct merge_node *
3220 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3222 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3224 struct merge_node *root = merge_tree;
3225 root->lo = root->hi = root->end_lo = root->end_hi = NULL;
3226 root->dest = NULL;
3227 root->nlo = root->nhi = nlines;
3228 root->parent = NULL;
3229 root->level = MERGE_END;
3230 root->queued = false;
3231 pthread_mutex_init (&root->lock, NULL);
3233 init_node (root, root + 1, dest, nthreads, nlines, false);
3234 return merge_tree;
3237 /* Destroy the merge tree. */
3238 static void
3239 merge_tree_destroy (size_t nthreads, struct merge_node *merge_tree)
3241 size_t n_nodes = nthreads * 2;
3242 struct merge_node *node = merge_tree;
3244 while (n_nodes--)
3246 pthread_mutex_destroy (&node->lock);
3247 node++;
3250 free (merge_tree);
3253 /* Initialize a merge tree node and its descendants. The node's
3254 parent is PARENT. The node and its descendants are taken from the
3255 array of nodes NODE_POOL. Their destination starts at DEST; they
3256 will consume NTHREADS threads. The total number of sort lines is
3257 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3258 its parent. */
3260 static struct merge_node *
3261 init_node (struct merge_node *restrict parent,
3262 struct merge_node *restrict node_pool,
3263 struct line *dest, size_t nthreads,
3264 size_t total_lines, bool is_lo_child)
3266 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3267 size_t nlo = nlines / 2;
3268 size_t nhi = nlines - nlo;
3269 struct line *lo = dest - total_lines;
3270 struct line *hi = lo - nlo;
3271 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3273 struct merge_node *node = node_pool++;
3274 node->lo = node->end_lo = lo;
3275 node->hi = node->end_hi = hi;
3276 node->dest = parent_end;
3277 node->nlo = nlo;
3278 node->nhi = nhi;
3279 node->parent = parent;
3280 node->level = parent->level + 1;
3281 node->queued = false;
3282 pthread_mutex_init (&node->lock, NULL);
3284 if (nthreads > 1)
3286 size_t lo_threads = nthreads / 2;
3287 size_t hi_threads = nthreads - lo_threads;
3288 node->lo_child = node_pool;
3289 node_pool = init_node (node, node_pool, lo, lo_threads,
3290 total_lines, true);
3291 node->hi_child = node_pool;
3292 node_pool = init_node (node, node_pool, hi, hi_threads,
3293 total_lines, false);
3295 else
3297 node->lo_child = NULL;
3298 node->hi_child = NULL;
3300 return node_pool;
3304 /* Compare two merge nodes A and B for priority. */
3306 static int
3307 compare_nodes (void const *a, void const *b)
3309 struct merge_node const *nodea = a;
3310 struct merge_node const *nodeb = b;
3311 if (nodea->level == nodeb->level)
3312 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3313 return nodea->level < nodeb->level;
3316 /* Lock a merge tree NODE. */
3318 static inline void
3319 lock_node (struct merge_node *node)
3321 pthread_mutex_lock (&node->lock);
3324 /* Unlock a merge tree NODE. */
3326 static inline void
3327 unlock_node (struct merge_node *node)
3329 pthread_mutex_unlock (&node->lock);
3332 /* Destroy merge QUEUE. */
3334 static void
3335 queue_destroy (struct merge_node_queue *queue)
3337 heap_free (queue->priority_queue);
3338 pthread_cond_destroy (&queue->cond);
3339 pthread_mutex_destroy (&queue->mutex);
3342 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3343 NTHREADS threads. */
3345 static void
3346 queue_init (struct merge_node_queue *queue, size_t nthreads)
3348 /* Though it's highly unlikely all nodes are in the heap at the same
3349 time, the heap should accommodate all of them. Counting a NULL
3350 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3351 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3352 pthread_mutex_init (&queue->mutex, NULL);
3353 pthread_cond_init (&queue->cond, NULL);
3356 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3357 does not need to lock NODE. */
3359 static void
3360 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3362 pthread_mutex_lock (&queue->mutex);
3363 heap_insert (queue->priority_queue, node);
3364 node->queued = true;
3365 pthread_cond_signal (&queue->cond);
3366 pthread_mutex_unlock (&queue->mutex);
3369 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3371 static struct merge_node *
3372 queue_pop (struct merge_node_queue *queue)
3374 struct merge_node *node;
3375 pthread_mutex_lock (&queue->mutex);
3376 while (! (node = heap_remove_top (queue->priority_queue)))
3377 pthread_cond_wait (&queue->cond, &queue->mutex);
3378 pthread_mutex_unlock (&queue->mutex);
3379 lock_node (node);
3380 node->queued = false;
3381 return node;
3384 /* Output LINE to TFP, unless -u is specified and the line compares
3385 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3386 is null if TFP is standard output.
3388 This function does not save the line for comparison later, so it is
3389 appropriate only for internal sort. */
3391 static void
3392 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3394 if (unique)
3396 if (saved_line.text && ! compare (line, &saved_line))
3397 return;
3398 saved_line = *line;
3401 write_line (line, tfp, temp_output);
3404 /* Merge the lines currently available to a NODE in the binary
3405 merge tree. Merge a number of lines appropriate for this merge
3406 level, assuming TOTAL_LINES is the total number of lines.
3408 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3409 the name of TFP, or is null if TFP is standard output. */
3411 static void
3412 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3413 FILE *tfp, char const *temp_output)
3415 struct line *lo_orig = node->lo;
3416 struct line *hi_orig = node->hi;
3417 size_t to_merge = MAX_MERGE (total_lines, node->level);
3418 size_t merged_lo;
3419 size_t merged_hi;
3421 if (node->level > MERGE_ROOT)
3423 /* Merge to destination buffer. */
3424 struct line *dest = *node->dest;
3425 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3426 if (compare (node->lo - 1, node->hi - 1) <= 0)
3427 *--dest = *--node->lo;
3428 else
3429 *--dest = *--node->hi;
3431 merged_lo = lo_orig - node->lo;
3432 merged_hi = hi_orig - node->hi;
3434 if (node->nhi == merged_hi)
3435 while (node->lo != node->end_lo && to_merge--)
3436 *--dest = *--node->lo;
3437 else if (node->nlo == merged_lo)
3438 while (node->hi != node->end_hi && to_merge--)
3439 *--dest = *--node->hi;
3440 *node->dest = dest;
3442 else
3444 /* Merge directly to output. */
3445 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3447 if (compare (node->lo - 1, node->hi - 1) <= 0)
3448 write_unique (--node->lo, tfp, temp_output);
3449 else
3450 write_unique (--node->hi, tfp, temp_output);
3453 merged_lo = lo_orig - node->lo;
3454 merged_hi = hi_orig - node->hi;
3456 if (node->nhi == merged_hi)
3458 while (node->lo != node->end_lo && to_merge--)
3459 write_unique (--node->lo, tfp, temp_output);
3461 else if (node->nlo == merged_lo)
3463 while (node->hi != node->end_hi && to_merge--)
3464 write_unique (--node->hi, tfp, temp_output);
3468 /* Update NODE. */
3469 merged_lo = lo_orig - node->lo;
3470 merged_hi = hi_orig - node->hi;
3471 node->nlo -= merged_lo;
3472 node->nhi -= merged_hi;
3475 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3476 NODE's children has available lines and the other either has
3477 available lines or has exhausted its lines. */
3479 static void
3480 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3482 if (! node->queued)
3484 bool lo_avail = (node->lo - node->end_lo) != 0;
3485 bool hi_avail = (node->hi - node->end_hi) != 0;
3486 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3487 queue_insert (queue, node);
3491 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3493 static void
3494 queue_check_insert_parent (struct merge_node_queue *queue,
3495 struct merge_node *node)
3497 if (node->level > MERGE_ROOT)
3499 lock_node (node->parent);
3500 queue_check_insert (queue, node->parent);
3501 unlock_node (node->parent);
3503 else if (node->nlo + node->nhi == 0)
3505 /* If the MERGE_ROOT NODE has finished merging, insert the
3506 MERGE_END node. */
3507 queue_insert (queue, node->parent);
3511 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3512 some of those lines, until the MERGE_END node is popped.
3513 TOTAL_LINES is the total number of lines. If merging at the top
3514 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3515 null if TFP is standard output. */
3517 static void
3518 merge_loop (struct merge_node_queue *queue,
3519 size_t total_lines, FILE *tfp, char const *temp_output)
3521 while (1)
3523 struct merge_node *node = queue_pop (queue);
3525 if (node->level == MERGE_END)
3527 unlock_node (node);
3528 /* Reinsert so other threads can pop it. */
3529 queue_insert (queue, node);
3530 break;
3532 mergelines_node (node, total_lines, tfp, temp_output);
3533 queue_check_insert (queue, node);
3534 queue_check_insert_parent (queue, node);
3536 unlock_node (node);
3541 static void sortlines (struct line *restrict, size_t, size_t,
3542 struct merge_node *, struct merge_node_queue *,
3543 FILE *, char const *);
3545 /* Thread arguments for sortlines_thread. */
3547 struct thread_args
3549 /* Source, i.e., the array of lines to sort. This points just past
3550 the end of the array. */
3551 struct line *lines;
3553 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3554 size_t nthreads;
3556 /* Number of lines in LINES and DEST. */
3557 size_t const total_lines;
3559 /* Merge node. Lines from this node and this node's sibling will merged
3560 to this node's parent. */
3561 struct merge_node *const node;
3563 /* The priority queue controlling available work for the entire
3564 internal sort. */
3565 struct merge_node_queue *const queue;
3567 /* If at the top level, the file to output to, and the file's name.
3568 If the file is standard output, the file's name is null. */
3569 FILE *tfp;
3570 char const *output_temp;
3573 /* Like sortlines, except with a signature acceptable to pthread_create. */
3575 static void *
3576 sortlines_thread (void *data)
3578 struct thread_args const *args = data;
3579 sortlines (args->lines, args->nthreads, args->total_lines,
3580 args->node, args->queue, args->tfp,
3581 args->output_temp);
3582 return NULL;
3585 /* Sort lines, possibly in parallel. The arguments are as in struct
3586 thread_args above.
3588 The algorithm has three phases: node creation, sequential sort,
3589 and binary merge.
3591 During node creation, sortlines recursively visits each node in the
3592 binary merge tree and creates a NODE structure corresponding to all the
3593 future line merging NODE is responsible for. For each call to
3594 sortlines, half the available threads are assigned to each recursive
3595 call, until a leaf node having only 1 available thread is reached.
3597 Each leaf node then performs two sequential sorts, one on each half of
3598 the lines it is responsible for. It records in its NODE structure that
3599 there are two sorted sublists available to merge from, and inserts its
3600 NODE into the priority queue.
3602 The binary merge phase then begins. Each thread drops into a loop
3603 where the thread retrieves a NODE from the priority queue, merges lines
3604 available to that NODE, and potentially insert NODE or its parent back
3605 into the queue if there are sufficient available lines for them to
3606 merge. This continues until all lines at all nodes of the merge tree
3607 have been merged. */
3609 static void
3610 sortlines (struct line *restrict lines, size_t nthreads,
3611 size_t total_lines, struct merge_node *node,
3612 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3614 size_t nlines = node->nlo + node->nhi;
3616 /* Calculate thread arguments. */
3617 size_t lo_threads = nthreads / 2;
3618 size_t hi_threads = nthreads - lo_threads;
3619 pthread_t thread;
3620 struct thread_args args = {lines, lo_threads, total_lines,
3621 node->lo_child, queue, tfp, temp_output};
3623 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3624 && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3626 sortlines (lines - node->nlo, hi_threads, total_lines,
3627 node->hi_child, queue, tfp, temp_output);
3628 pthread_join (thread, NULL);
3630 else
3632 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3633 Sort with 1 thread. */
3634 size_t nlo = node->nlo;
3635 size_t nhi = node->nhi;
3636 struct line *temp = lines - total_lines;
3637 if (1 < nhi)
3638 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3639 if (1 < nlo)
3640 sequential_sort (lines, nlo, temp, false);
3642 /* Update merge NODE. No need to lock yet. */
3643 node->lo = lines;
3644 node->hi = lines - nlo;
3645 node->end_lo = lines - nlo;
3646 node->end_hi = lines - nlo - nhi;
3648 queue_insert (queue, node);
3649 merge_loop (queue, total_lines, tfp, temp_output);
3653 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3654 the same as OUTFILE. If found, replace each with the same
3655 temporary copy that can be merged into OUTFILE without destroying
3656 OUTFILE before it is completely read. This temporary copy does not
3657 count as a merge temp, so don't worry about incrementing NTEMPS in
3658 the caller; final cleanup will remove it, not zaptemp.
3660 This test ensures that an otherwise-erroneous use like
3661 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3662 It's not clear that POSIX requires this nicety.
3663 Detect common error cases, but don't try to catch obscure cases like
3664 "cat ... FILE ... | sort -m -o FILE"
3665 where traditional "sort" doesn't copy the input and where
3666 people should know that they're getting into trouble anyway.
3667 Catching these obscure cases would slow down performance in
3668 common cases. */
3670 static void
3671 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3672 size_t nfiles, char const *outfile)
3674 size_t i;
3675 bool got_outstat = false;
3676 struct stat outstat;
3677 struct tempnode *tempcopy = NULL;
3679 for (i = ntemps; i < nfiles; i++)
3681 bool is_stdin = STREQ (files[i].name, "-");
3682 bool same;
3683 struct stat instat;
3685 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3686 same = true;
3687 else
3689 if (! got_outstat)
3691 if (fstat (STDOUT_FILENO, &outstat) != 0)
3692 break;
3693 got_outstat = true;
3696 same = (((is_stdin
3697 ? fstat (STDIN_FILENO, &instat)
3698 : stat (files[i].name, &instat))
3699 == 0)
3700 && SAME_INODE (instat, outstat));
3703 if (same)
3705 if (! tempcopy)
3707 FILE *tftp;
3708 tempcopy = create_temp (&tftp);
3709 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3712 files[i].name = tempcopy->name;
3713 files[i].temp = tempcopy;
3718 /* Scan the input files to ensure all are accessible.
3719 Otherwise exit with a diagnostic.
3721 Note this will catch common issues with permissions etc.
3722 but will fail to notice issues where you can open() but not read(),
3723 like when a directory is specified on some systems.
3724 Catching these obscure cases could slow down performance in
3725 common cases. */
3727 static void
3728 check_inputs (char *const *files, size_t nfiles)
3730 size_t i;
3731 for (i = 0; i < nfiles; i++)
3733 if (STREQ (files[i], "-"))
3734 continue;
3736 if (euidaccess (files[i], R_OK) != 0)
3737 die (_("cannot read"), files[i]);
3741 /* Ensure a specified output file can be created or written to,
3742 and point stdout to it. Do not truncate the file.
3743 Exit with a diagnostic on failure. */
3745 static void
3746 check_output (char const *outfile)
3748 if (outfile)
3750 int outfd = open (outfile, O_WRONLY | O_CREAT | O_BINARY, MODE_RW_UGO);
3751 if (outfd < 0)
3752 die (_("open failed"), outfile);
3753 move_fd_or_die (outfd, STDOUT_FILENO);
3757 /* Merge the input FILES. NTEMPS is the number of files at the
3758 start of FILES that are temporary; it is zero at the top level.
3759 NFILES is the total number of files. Put the output in
3760 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3762 static void
3763 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3764 char const *output_file)
3766 while (nmerge < nfiles)
3768 /* Number of input files processed so far. */
3769 size_t in;
3771 /* Number of output files generated so far. */
3772 size_t out;
3774 /* nfiles % NMERGE; this counts input files that are left over
3775 after all full-sized merges have been done. */
3776 size_t remainder;
3778 /* Number of easily-available slots at the next loop iteration. */
3779 size_t cheap_slots;
3781 /* Do as many NMERGE-size merges as possible. In the case that
3782 nmerge is bogus, increment by the maximum number of file
3783 descriptors allowed. */
3784 for (out = in = 0; nmerge <= nfiles - in; out++)
3786 FILE *tfp;
3787 struct tempnode *temp = create_temp (&tfp);
3788 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3789 nmerge, tfp, temp->name);
3790 ntemps -= MIN (ntemps, num_merged);
3791 files[out].name = temp->name;
3792 files[out].temp = temp;
3793 in += num_merged;
3796 remainder = nfiles - in;
3797 cheap_slots = nmerge - out % nmerge;
3799 if (cheap_slots < remainder)
3801 /* So many files remain that they can't all be put into the last
3802 NMERGE-sized output window. Do one more merge. Merge as few
3803 files as possible, to avoid needless I/O. */
3804 size_t nshortmerge = remainder - cheap_slots + 1;
3805 FILE *tfp;
3806 struct tempnode *temp = create_temp (&tfp);
3807 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3808 nshortmerge, tfp, temp->name);
3809 ntemps -= MIN (ntemps, num_merged);
3810 files[out].name = temp->name;
3811 files[out++].temp = temp;
3812 in += num_merged;
3815 /* Put the remaining input files into the last NMERGE-sized output
3816 window, so they will be merged in the next pass. */
3817 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3818 ntemps += out;
3819 nfiles -= in - out;
3822 avoid_trashing_input (files, ntemps, nfiles, output_file);
3824 /* We aren't guaranteed that this final mergefiles will work, therefore we
3825 try to merge into the output, and then merge as much as we can into a
3826 temp file if we can't. Repeat. */
3828 while (true)
3830 /* Merge directly into the output file if possible. */
3831 FILE **fps;
3832 size_t nopened = open_input_files (files, nfiles, &fps);
3834 if (nopened == nfiles)
3836 FILE *ofp = stream_open (output_file, "w");
3837 if (ofp)
3839 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3840 break;
3842 if (errno != EMFILE || nopened <= 2)
3843 die (_("open failed"), output_file);
3845 else if (nopened <= 2)
3846 die (_("open failed"), files[nopened].name);
3848 /* We ran out of file descriptors. Close one of the input
3849 files, to gain a file descriptor. Then create a temporary
3850 file with our spare file descriptor. Retry if that failed
3851 (e.g., some other process could open a file between the time
3852 we closed and tried to create). */
3853 FILE *tfp;
3854 struct tempnode *temp;
3857 nopened--;
3858 xfclose (fps[nopened], files[nopened].name);
3859 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
3861 while (!temp);
3863 /* Merge into the newly allocated temporary. */
3864 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
3865 fps);
3866 ntemps -= MIN (ntemps, nopened);
3867 files[0].name = temp->name;
3868 files[0].temp = temp;
3870 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3871 ntemps++;
3872 nfiles -= nopened - 1;
3876 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3878 static void
3879 sort (char *const *files, size_t nfiles, char const *output_file,
3880 size_t nthreads)
3882 struct buffer buf;
3883 IF_LINT (buf.buf = NULL);
3884 size_t ntemps = 0;
3885 bool output_file_created = false;
3887 buf.alloc = 0;
3889 while (nfiles)
3891 char const *temp_output;
3892 char const *file = *files;
3893 FILE *fp = xfopen (file, "r");
3894 FILE *tfp;
3896 size_t bytes_per_line;
3897 if (nthreads > 1)
3899 /* Get log P. */
3900 size_t tmp = 1;
3901 size_t mult = 1;
3902 while (tmp < nthreads)
3904 tmp *= 2;
3905 mult++;
3907 bytes_per_line = (mult * sizeof (struct line));
3909 else
3910 bytes_per_line = sizeof (struct line) * 3 / 2;
3912 if (! buf.alloc)
3913 initbuf (&buf, bytes_per_line,
3914 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3915 buf.eof = false;
3916 files++;
3917 nfiles--;
3919 while (fillbuf (&buf, fp, file))
3921 struct line *line;
3923 if (buf.eof && nfiles
3924 && (bytes_per_line + 1
3925 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3927 /* End of file, but there is more input and buffer room.
3928 Concatenate the next input file; this is faster in
3929 the usual case. */
3930 buf.left = buf.used;
3931 break;
3934 saved_line.text = NULL;
3935 line = buffer_linelim (&buf);
3936 if (buf.eof && !nfiles && !ntemps && !buf.left)
3938 xfclose (fp, file);
3939 tfp = xfopen (output_file, "w");
3940 temp_output = output_file;
3941 output_file_created = true;
3943 else
3945 ++ntemps;
3946 temp_output = create_temp (&tfp)->name;
3948 if (1 < buf.nlines)
3950 struct merge_node_queue queue;
3951 queue_init (&queue, nthreads);
3952 struct merge_node *merge_tree =
3953 merge_tree_init (nthreads, buf.nlines, line);
3955 sortlines (line, nthreads, buf.nlines, merge_tree + 1,
3956 &queue, tfp, temp_output);
3958 #ifdef lint
3959 merge_tree_destroy (nthreads, merge_tree);
3960 queue_destroy (&queue);
3961 #endif
3963 else
3964 write_unique (line - 1, tfp, temp_output);
3966 xfclose (tfp, temp_output);
3968 if (output_file_created)
3969 goto finish;
3971 xfclose (fp, file);
3974 finish:
3975 free (buf.buf);
3977 if (! output_file_created)
3979 size_t i;
3980 struct tempnode *node = temphead;
3981 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3982 for (i = 0; node; i++)
3984 tempfiles[i].name = node->name;
3985 tempfiles[i].temp = node;
3986 node = node->next;
3988 merge (tempfiles, ntemps, ntemps, output_file);
3989 free (tempfiles);
3992 reap_all ();
3995 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3997 static void
3998 insertkey (struct keyfield *key_arg)
4000 struct keyfield **p;
4001 struct keyfield *key = xmemdup (key_arg, sizeof *key);
4003 for (p = &keylist; *p; p = &(*p)->next)
4004 continue;
4005 *p = key;
4006 key->next = NULL;
4009 /* Report a bad field specification SPEC, with extra info MSGID. */
4011 static void badfieldspec (char const *, char const *)
4012 ATTRIBUTE_NORETURN;
4013 static void
4014 badfieldspec (char const *spec, char const *msgid)
4016 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
4017 _(msgid), quote (spec));
4018 abort ();
4021 /* Report incompatible options. */
4023 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
4024 static void
4025 incompatible_options (char const *opts)
4027 error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), (opts));
4028 abort ();
4031 /* Check compatibility of ordering options. */
4033 static void
4034 check_ordering_compatibility (void)
4036 struct keyfield *key;
4038 for (key = keylist; key; key = key->next)
4039 if (1 < (key->numeric + key->general_numeric + key->human_numeric
4040 + key->month + (key->version | key->random | !!key->ignore)))
4042 /* The following is too big, but guaranteed to be "big enough". */
4043 char opts[sizeof short_options];
4044 /* Clear flags we're not interested in. */
4045 key->skipsblanks = key->skipeblanks = key->reverse = false;
4046 key_to_opts (key, opts);
4047 incompatible_options (opts);
4051 /* Parse the leading integer in STRING and store the resulting value
4052 (which must fit into size_t) into *VAL. Return the address of the
4053 suffix after the integer. If the value is too large, silently
4054 substitute SIZE_MAX. If MSGID is NULL, return NULL after
4055 failure; otherwise, report MSGID and exit on failure. */
4057 static char const *
4058 parse_field_count (char const *string, size_t *val, char const *msgid)
4060 char *suffix;
4061 uintmax_t n;
4063 switch (xstrtoumax (string, &suffix, 10, &n, ""))
4065 case LONGINT_OK:
4066 case LONGINT_INVALID_SUFFIX_CHAR:
4067 *val = n;
4068 if (*val == n)
4069 break;
4070 /* Fall through. */
4071 case LONGINT_OVERFLOW:
4072 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
4073 *val = SIZE_MAX;
4074 break;
4076 case LONGINT_INVALID:
4077 if (msgid)
4078 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
4079 _(msgid), quote (string));
4080 return NULL;
4083 return suffix;
4086 /* Handle interrupts and hangups. */
4088 static void
4089 sighandler (int sig)
4091 if (! SA_NOCLDSTOP)
4092 signal (sig, SIG_IGN);
4094 cleanup ();
4096 signal (sig, SIG_DFL);
4097 raise (sig);
4100 /* Set the ordering options for KEY specified in S.
4101 Return the address of the first character in S that
4102 is not a valid ordering option.
4103 BLANKTYPE is the kind of blanks that 'b' should skip. */
4105 static char *
4106 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4108 while (*s)
4110 switch (*s)
4112 case 'b':
4113 if (blanktype == bl_start || blanktype == bl_both)
4114 key->skipsblanks = true;
4115 if (blanktype == bl_end || blanktype == bl_both)
4116 key->skipeblanks = true;
4117 break;
4118 case 'd':
4119 key->ignore = nondictionary;
4120 break;
4121 case 'f':
4122 key->translate = fold_toupper;
4123 break;
4124 case 'g':
4125 key->general_numeric = true;
4126 break;
4127 case 'h':
4128 key->human_numeric = true;
4129 break;
4130 case 'i':
4131 /* Option order should not matter, so don't let -i override
4132 -d. -d implies -i, but -i does not imply -d. */
4133 if (! key->ignore)
4134 key->ignore = nonprinting;
4135 break;
4136 case 'M':
4137 key->month = true;
4138 break;
4139 case 'n':
4140 key->numeric = true;
4141 break;
4142 case 'R':
4143 key->random = true;
4144 break;
4145 case 'r':
4146 key->reverse = true;
4147 break;
4148 case 'V':
4149 key->version = true;
4150 break;
4151 default:
4152 return (char *) s;
4154 ++s;
4156 return (char *) s;
4159 /* Initialize KEY. */
4161 static struct keyfield *
4162 key_init (struct keyfield *key)
4164 memset (key, 0, sizeof *key);
4165 key->eword = SIZE_MAX;
4166 return key;
4170 main (int argc, char **argv)
4172 struct keyfield *key;
4173 struct keyfield key_buf;
4174 struct keyfield gkey;
4175 bool gkey_only = false;
4176 char const *s;
4177 int c = 0;
4178 char checkonly = 0;
4179 bool mergeonly = false;
4180 char *random_source = NULL;
4181 bool need_random = false;
4182 size_t nthreads = 0;
4183 size_t nfiles = 0;
4184 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
4185 bool obsolete_usage = (posix2_version () < 200112);
4186 char **files;
4187 char *files_from = NULL;
4188 struct Tokens tok;
4189 char const *outfile = NULL;
4190 bool locale_ok;
4192 initialize_main (&argc, &argv);
4193 set_program_name (argv[0]);
4194 locale_ok = setlocale (LC_ALL, "");
4195 bindtextdomain (PACKAGE, LOCALEDIR);
4196 textdomain (PACKAGE);
4198 initialize_exit_failure (SORT_FAILURE);
4200 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4201 #if HAVE_NL_LANGINFO
4202 hard_LC_TIME = hard_locale (LC_TIME);
4203 #endif
4205 /* Get locale's representation of the decimal point. */
4207 struct lconv const *locale = localeconv ();
4209 /* If the locale doesn't define a decimal point, or if the decimal
4210 point is multibyte, use the C locale's decimal point. FIXME:
4211 add support for multibyte decimal points. */
4212 decimal_point = to_uchar (locale->decimal_point[0]);
4213 if (! decimal_point || locale->decimal_point[1])
4214 decimal_point = '.';
4216 /* FIXME: add support for multibyte thousands separators. */
4217 thousands_sep = to_uchar (*locale->thousands_sep);
4218 if (! thousands_sep || locale->thousands_sep[1])
4219 thousands_sep = -1;
4222 have_read_stdin = false;
4223 inittables ();
4226 size_t i;
4227 static int const sig[] =
4229 /* The usual suspects. */
4230 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4231 #ifdef SIGPOLL
4232 SIGPOLL,
4233 #endif
4234 #ifdef SIGPROF
4235 SIGPROF,
4236 #endif
4237 #ifdef SIGVTALRM
4238 SIGVTALRM,
4239 #endif
4240 #ifdef SIGXCPU
4241 SIGXCPU,
4242 #endif
4243 #ifdef SIGXFSZ
4244 SIGXFSZ,
4245 #endif
4247 enum { nsigs = ARRAY_CARDINALITY (sig) };
4249 #if SA_NOCLDSTOP
4250 struct sigaction act;
4252 sigemptyset (&caught_signals);
4253 for (i = 0; i < nsigs; i++)
4255 sigaction (sig[i], NULL, &act);
4256 if (act.sa_handler != SIG_IGN)
4257 sigaddset (&caught_signals, sig[i]);
4260 act.sa_handler = sighandler;
4261 act.sa_mask = caught_signals;
4262 act.sa_flags = 0;
4264 for (i = 0; i < nsigs; i++)
4265 if (sigismember (&caught_signals, sig[i]))
4266 sigaction (sig[i], &act, NULL);
4267 #else
4268 for (i = 0; i < nsigs; i++)
4269 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4271 signal (sig[i], sighandler);
4272 siginterrupt (sig[i], 1);
4274 #endif
4276 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4278 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4279 atexit (exit_cleanup);
4281 key_init (&gkey);
4282 gkey.sword = SIZE_MAX;
4284 files = xnmalloc (argc, sizeof *files);
4286 while (true)
4288 /* Parse an operand as a file after "--" was seen; or if
4289 pedantic and a file was seen, unless the POSIX version
4290 predates 1003.1-2001 and -c was not seen and the operand is
4291 "-o FILE" or "-oFILE". */
4292 int oi = -1;
4294 if (c == -1
4295 || (posixly_correct && nfiles != 0
4296 && ! (obsolete_usage
4297 && ! checkonly
4298 && optind != argc
4299 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4300 && (argv[optind][2] || optind + 1 != argc)))
4301 || ((c = getopt_long (argc, argv, short_options,
4302 long_options, &oi))
4303 == -1))
4305 if (argc <= optind)
4306 break;
4307 files[nfiles++] = argv[optind++];
4309 else switch (c)
4311 case 1:
4312 key = NULL;
4313 if (optarg[0] == '+')
4315 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4316 && ISDIGIT (argv[optind][1]));
4317 obsolete_usage |= minus_pos_usage && !posixly_correct;
4318 if (obsolete_usage)
4320 /* Treat +POS1 [-POS2] as a key if possible; but silently
4321 treat an operand as a file if it is not a valid +POS1. */
4322 key = key_init (&key_buf);
4323 s = parse_field_count (optarg + 1, &key->sword, NULL);
4324 if (s && *s == '.')
4325 s = parse_field_count (s + 1, &key->schar, NULL);
4326 if (! (key->sword || key->schar))
4327 key->sword = SIZE_MAX;
4328 if (! s || *set_ordering (s, key, bl_start))
4329 key = NULL;
4330 else
4332 if (minus_pos_usage)
4334 char const *optarg1 = argv[optind++];
4335 s = parse_field_count (optarg1 + 1, &key->eword,
4336 N_("invalid number after '-'"));
4337 /* When called with a non-NULL message ID,
4338 parse_field_count cannot return NULL. Tell static
4339 analysis tools that dereferencing S is safe. */
4340 assert (s);
4341 if (*s == '.')
4342 s = parse_field_count (s + 1, &key->echar,
4343 N_("invalid number after '.'"));
4344 if (!key->echar && key->eword)
4346 /* obsolescent syntax +A.x -B.y is equivalent to:
4347 -k A+1.x+1,B.y (when y = 0)
4348 -k A+1.x+1,B+1.y (when y > 0)
4349 So eword is decremented as in the -k case
4350 only when the end field (B) is specified and
4351 echar (y) is 0. */
4352 key->eword--;
4354 if (*set_ordering (s, key, bl_end))
4355 badfieldspec (optarg1,
4356 N_("stray character in field spec"));
4358 key->obsolete_used = true;
4359 insertkey (key);
4363 if (! key)
4364 files[nfiles++] = optarg;
4365 break;
4367 case SORT_OPTION:
4368 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4369 /* Fall through. */
4370 case 'b':
4371 case 'd':
4372 case 'f':
4373 case 'g':
4374 case 'h':
4375 case 'i':
4376 case 'M':
4377 case 'n':
4378 case 'r':
4379 case 'R':
4380 case 'V':
4382 char str[2];
4383 str[0] = c;
4384 str[1] = '\0';
4385 set_ordering (str, &gkey, bl_both);
4387 break;
4389 case CHECK_OPTION:
4390 c = (optarg
4391 ? XARGMATCH ("--check", optarg, check_args, check_types)
4392 : 'c');
4393 /* Fall through. */
4394 case 'c':
4395 case 'C':
4396 if (checkonly && checkonly != c)
4397 incompatible_options ("cC");
4398 checkonly = c;
4399 break;
4401 case COMPRESS_PROGRAM_OPTION:
4402 if (compress_program && !STREQ (compress_program, optarg))
4403 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4404 compress_program = optarg;
4405 break;
4407 case DEBUG_PROGRAM_OPTION:
4408 debug = true;
4409 break;
4411 case FILES0_FROM_OPTION:
4412 files_from = optarg;
4413 break;
4415 case 'k':
4416 key = key_init (&key_buf);
4418 /* Get POS1. */
4419 s = parse_field_count (optarg, &key->sword,
4420 N_("invalid number at field start"));
4421 if (! key->sword--)
4423 /* Provoke with 'sort -k0' */
4424 badfieldspec (optarg, N_("field number is zero"));
4426 if (*s == '.')
4428 s = parse_field_count (s + 1, &key->schar,
4429 N_("invalid number after '.'"));
4430 if (! key->schar--)
4432 /* Provoke with 'sort -k1.0' */
4433 badfieldspec (optarg, N_("character offset is zero"));
4436 if (! (key->sword || key->schar))
4437 key->sword = SIZE_MAX;
4438 s = set_ordering (s, key, bl_start);
4439 if (*s != ',')
4441 key->eword = SIZE_MAX;
4442 key->echar = 0;
4444 else
4446 /* Get POS2. */
4447 s = parse_field_count (s + 1, &key->eword,
4448 N_("invalid number after ','"));
4449 if (! key->eword--)
4451 /* Provoke with 'sort -k1,0' */
4452 badfieldspec (optarg, N_("field number is zero"));
4454 if (*s == '.')
4456 s = parse_field_count (s + 1, &key->echar,
4457 N_("invalid number after '.'"));
4459 s = set_ordering (s, key, bl_end);
4461 if (*s)
4462 badfieldspec (optarg, N_("stray character in field spec"));
4463 insertkey (key);
4464 break;
4466 case 'm':
4467 mergeonly = true;
4468 break;
4470 case NMERGE_OPTION:
4471 specify_nmerge (oi, c, optarg);
4472 break;
4474 case 'o':
4475 if (outfile && !STREQ (outfile, optarg))
4476 error (SORT_FAILURE, 0, _("multiple output files specified"));
4477 outfile = optarg;
4478 break;
4480 case RANDOM_SOURCE_OPTION:
4481 if (random_source && !STREQ (random_source, optarg))
4482 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4483 random_source = optarg;
4484 break;
4486 case 's':
4487 stable = true;
4488 break;
4490 case 'S':
4491 specify_sort_size (oi, c, optarg);
4492 break;
4494 case 't':
4496 char newtab = optarg[0];
4497 if (! newtab)
4498 error (SORT_FAILURE, 0, _("empty tab"));
4499 if (optarg[1])
4501 if (STREQ (optarg, "\\0"))
4502 newtab = '\0';
4503 else
4505 /* Provoke with 'sort -txx'. Complain about
4506 "multi-character tab" instead of "multibyte tab", so
4507 that the diagnostic's wording does not need to be
4508 changed once multibyte characters are supported. */
4509 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4510 quote (optarg));
4513 if (tab != TAB_DEFAULT && tab != newtab)
4514 error (SORT_FAILURE, 0, _("incompatible tabs"));
4515 tab = newtab;
4517 break;
4519 case 'T':
4520 add_temp_dir (optarg);
4521 break;
4523 case PARALLEL_OPTION:
4524 nthreads = specify_nthreads (oi, c, optarg);
4525 break;
4527 case 'u':
4528 unique = true;
4529 break;
4531 case 'y':
4532 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4533 through Solaris 7. It is also accepted by many non-Solaris
4534 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4535 -y is marked as obsolete starting with Solaris 8 (1999), but is
4536 still accepted as of Solaris 10 prerelease (2004).
4538 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4539 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4540 and which in general ignores the argument after "-y" if it
4541 consists entirely of digits (it can even be empty). */
4542 if (optarg == argv[optind - 1])
4544 char const *p;
4545 for (p = optarg; ISDIGIT (*p); p++)
4546 continue;
4547 optind -= (*p != '\0');
4549 break;
4551 case 'z':
4552 eolchar = 0;
4553 break;
4555 case_GETOPT_HELP_CHAR;
4557 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4559 default:
4560 usage (SORT_FAILURE);
4564 if (files_from)
4566 FILE *stream;
4568 /* When using --files0-from=F, you may not specify any files
4569 on the command-line. */
4570 if (nfiles)
4572 error (0, 0, _("extra operand %s"), quoteaf (files[0]));
4573 fprintf (stderr, "%s\n",
4574 _("file operands cannot be combined with --files0-from"));
4575 usage (SORT_FAILURE);
4578 if (STREQ (files_from, "-"))
4579 stream = stdin;
4580 else
4582 stream = fopen (files_from, "r");
4583 if (stream == NULL)
4584 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
4585 quoteaf (files_from));
4588 readtokens0_init (&tok);
4590 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
4591 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4592 quoteaf (files_from));
4594 if (tok.n_tok)
4596 size_t i;
4597 free (files);
4598 files = tok.tok;
4599 nfiles = tok.n_tok;
4600 for (i = 0; i < nfiles; i++)
4602 if (STREQ (files[i], "-"))
4603 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4604 "no file name of %s allowed"),
4605 quoteaf (files[i]));
4606 else if (files[i][0] == '\0')
4608 /* Using the standard 'filename:line-number:' prefix here is
4609 not totally appropriate, since NUL is the separator,
4610 not NL, but it might be better than nothing. */
4611 unsigned long int file_number = i + 1;
4612 error (SORT_FAILURE, 0,
4613 _("%s:%lu: invalid zero-length file name"),
4614 quotef (files_from), file_number);
4618 else
4619 error (SORT_FAILURE, 0, _("no input from %s"),
4620 quoteaf (files_from));
4623 /* Inheritance of global options to individual keys. */
4624 for (key = keylist; key; key = key->next)
4626 if (default_key_compare (key) && !key->reverse)
4628 key->ignore = gkey.ignore;
4629 key->translate = gkey.translate;
4630 key->skipsblanks = gkey.skipsblanks;
4631 key->skipeblanks = gkey.skipeblanks;
4632 key->month = gkey.month;
4633 key->numeric = gkey.numeric;
4634 key->general_numeric = gkey.general_numeric;
4635 key->human_numeric = gkey.human_numeric;
4636 key->version = gkey.version;
4637 key->random = gkey.random;
4638 key->reverse = gkey.reverse;
4641 need_random |= key->random;
4644 if (!keylist && !default_key_compare (&gkey))
4646 gkey_only = true;
4647 insertkey (&gkey);
4648 need_random |= gkey.random;
4651 check_ordering_compatibility ();
4653 if (debug)
4655 if (checkonly || outfile)
4657 static char opts[] = "X --debug";
4658 opts[0] = (checkonly ? checkonly : 'o');
4659 incompatible_options (opts);
4662 /* Always output the locale in debug mode, since this
4663 is such a common source of confusion. */
4664 if (hard_LC_COLLATE)
4665 error (0, 0, _("using %s sorting rules"),
4666 quote (setlocale (LC_COLLATE, NULL)));
4667 else
4669 error (0, 0, "%s%s", locale_ok ? "" : _("failed to set locale; "),
4670 _("using simple byte comparison"));
4672 key_warnings (&gkey, gkey_only);
4675 reverse = gkey.reverse;
4677 if (need_random)
4678 random_md5_state_init (random_source);
4680 if (temp_dir_count == 0)
4682 char const *tmp_dir = getenv ("TMPDIR");
4683 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4686 if (nfiles == 0)
4688 nfiles = 1;
4689 free (files);
4690 files = xmalloc (sizeof *files);
4691 *files = (char *) "-";
4694 /* Need to re-check that we meet the minimum requirement for memory
4695 usage with the final value for NMERGE. */
4696 if (0 < sort_size)
4697 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4699 if (checkonly)
4701 if (nfiles > 1)
4702 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4703 quoteaf (files[1]), checkonly);
4705 if (outfile)
4707 static char opts[] = {0, 'o', 0};
4708 opts[0] = checkonly;
4709 incompatible_options (opts);
4712 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4713 input is not properly sorted. */
4714 return check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER;
4717 /* Check all inputs are accessible, or exit immediately. */
4718 check_inputs (files, nfiles);
4720 /* Check output is writable, or exit immediately. */
4721 check_output (outfile);
4723 if (mergeonly)
4725 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4726 size_t i;
4728 for (i = 0; i < nfiles; ++i)
4729 sortfiles[i].name = files[i];
4731 merge (sortfiles, 0, nfiles, outfile);
4732 IF_LINT (free (sortfiles));
4734 else
4736 if (!nthreads)
4738 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4739 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4742 /* Avoid integer overflow later. */
4743 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4744 nthreads = MIN (nthreads, nthreads_max);
4746 sort (files, nfiles, outfile, nthreads);
4749 #ifdef lint
4750 if (files_from)
4751 readtokens0_free (&tok);
4752 else
4753 free (files);
4754 #endif
4756 if (have_read_stdin && fclose (stdin) == EOF)
4757 die (_("close failed"), "-");
4759 return EXIT_SUCCESS;