maint: sort: style adjustment to help clarify size determination
[coreutils.git] / src / sort.c
blob5a48ce6f0ca79d357c50b6b49b90401f8252dfd4
1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
23 #include <config.h>
25 #include <getopt.h>
26 #include <pthread.h>
27 #include <sys/resource.h>
28 #include <sys/types.h>
29 #include <sys/wait.h>
30 #include <signal.h>
31 #include <assert.h>
32 #include "system.h"
33 #include "argmatch.h"
34 #include "error.h"
35 #include "fadvise.h"
36 #include "filevercmp.h"
37 #include "hard-locale.h"
38 #include "hash.h"
39 #include "heap.h"
40 #include "ignore-value.h"
41 #include "md5.h"
42 #include "mbswidth.h"
43 #include "nproc.h"
44 #include "physmem.h"
45 #include "posixver.h"
46 #include "quote.h"
47 #include "quotearg.h"
48 #include "randread.h"
49 #include "readtokens0.h"
50 #include "stdio--.h"
51 #include "stdlib--.h"
52 #include "strnumcmp.h"
53 #include "xmemcoll.h"
54 #include "xnanosleep.h"
55 #include "xstrtol.h"
57 #ifndef RLIMIT_DATA
58 struct rlimit { size_t rlim_cur; };
59 # define getrlimit(Resource, Rlp) (-1)
60 #endif
62 /* The official name of this program (e.g., no 'g' prefix). */
63 #define PROGRAM_NAME "sort"
65 #define AUTHORS \
66 proper_name ("Mike Haertel"), \
67 proper_name ("Paul Eggert")
69 #if HAVE_LANGINFO_CODESET
70 # include <langinfo.h>
71 #endif
73 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
74 present. */
75 #ifndef SA_NOCLDSTOP
76 # define SA_NOCLDSTOP 0
77 /* No sigprocmask. Always 'return' zero. */
78 # define sigprocmask(How, Set, Oset) (0)
79 # define sigset_t int
80 # if ! HAVE_SIGINTERRUPT
81 # define siginterrupt(sig, flag) /* empty */
82 # endif
83 #endif
85 #if !defined OPEN_MAX && defined NR_OPEN
86 # define OPEN_MAX NR_OPEN
87 #endif
88 #if !defined OPEN_MAX
89 # define OPEN_MAX 20
90 #endif
92 #define UCHAR_LIM (UCHAR_MAX + 1)
94 #if HAVE_C99_STRTOLD
95 # define long_double long double
96 #else
97 # define long_double double
98 # undef strtold
99 # define strtold strtod
100 #endif
102 #ifndef DEFAULT_TMPDIR
103 # define DEFAULT_TMPDIR "/tmp"
104 #endif
106 /* Maximum number of lines to merge every time a NODE is taken from
107 the merge queue. Node is at LEVEL in the binary merge tree,
108 and is responsible for merging TOTAL lines. */
109 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
111 /* Heuristic value for the number of lines for which it is worth creating
112 a subthread, during an internal merge sort. I.e., it is a small number
113 of "average" lines for which sorting via two threads is faster than
114 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
115 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
116 lines of gensort -a output is sorted slightly faster with --parallel=2
117 than with --parallel=1. By contrast, using --parallel=1 is about 10%
118 faster than using --parallel=2 with a 64K-line input. */
119 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
120 verify (4 <= SUBTHREAD_LINES_HEURISTIC);
122 /* The number of threads after which there are
123 diminishing performance gains. */
124 enum { DEFAULT_MAX_THREADS = 8 };
126 /* Exit statuses. */
127 enum
129 /* POSIX says to exit with status 1 if invoked with -c and the
130 input is not properly sorted. */
131 SORT_OUT_OF_ORDER = 1,
133 /* POSIX says any other irregular exit must exit with a status
134 code greater than 1. */
135 SORT_FAILURE = 2
138 enum
140 /* The number of times we should try to fork a compression process
141 (we retry if the fork call fails). We don't _need_ to compress
142 temp files, this is just to reduce disk access, so this number
143 can be small. Each retry doubles in duration. */
144 MAX_FORK_TRIES_COMPRESS = 4,
146 /* The number of times we should try to fork a decompression process.
147 If we can't fork a decompression process, we can't sort, so this
148 number should be big. Each retry doubles in duration. */
149 MAX_FORK_TRIES_DECOMPRESS = 9
152 enum
154 /* Level of the end-of-merge node, one level above the root. */
155 MERGE_END = 0,
157 /* Level of the root node in merge tree. */
158 MERGE_ROOT = 1
161 /* The representation of the decimal point in the current locale. */
162 static int decimal_point;
164 /* Thousands separator; if -1, then there isn't one. */
165 static int thousands_sep;
167 /* Nonzero if the corresponding locales are hard. */
168 static bool hard_LC_COLLATE;
169 #if HAVE_NL_LANGINFO
170 static bool hard_LC_TIME;
171 #endif
173 #define NONZERO(x) ((x) != 0)
175 /* The kind of blanks for '-b' to skip in various options. */
176 enum blanktype { bl_start, bl_end, bl_both };
178 /* The character marking end of line. Default to \n. */
179 static char eolchar = '\n';
181 /* Lines are held in core as counted strings. */
182 struct line
184 char *text; /* Text of the line. */
185 size_t length; /* Length including final newline. */
186 char *keybeg; /* Start of first key. */
187 char *keylim; /* Limit of first key. */
190 /* Input buffers. */
191 struct buffer
193 char *buf; /* Dynamically allocated buffer,
194 partitioned into 3 regions:
195 - input data;
196 - unused area;
197 - an array of lines, in reverse order. */
198 size_t used; /* Number of bytes used for input data. */
199 size_t nlines; /* Number of lines in the line array. */
200 size_t alloc; /* Number of bytes allocated. */
201 size_t left; /* Number of bytes left from previous reads. */
202 size_t line_bytes; /* Number of bytes to reserve for each line. */
203 bool eof; /* An EOF has been read. */
206 /* Sort key. */
207 struct keyfield
209 size_t sword; /* Zero-origin 'word' to start at. */
210 size_t schar; /* Additional characters to skip. */
211 size_t eword; /* Zero-origin last 'word' of key. */
212 size_t echar; /* Additional characters in field. */
213 bool const *ignore; /* Boolean array of characters to ignore. */
214 char const *translate; /* Translation applied to characters. */
215 bool skipsblanks; /* Skip leading blanks when finding start. */
216 bool skipeblanks; /* Skip leading blanks when finding end. */
217 bool numeric; /* Flag for numeric comparison. Handle
218 strings of digits with optional decimal
219 point, but no exponential notation. */
220 bool random; /* Sort by random hash of key. */
221 bool general_numeric; /* Flag for general, numeric comparison.
222 Handle numbers in exponential notation. */
223 bool human_numeric; /* Flag for sorting by human readable
224 units with either SI xor IEC prefixes. */
225 bool month; /* Flag for comparison by month name. */
226 bool reverse; /* Reverse the sense of comparison. */
227 bool version; /* sort by version number */
228 bool obsolete_used; /* obsolescent key option format is used. */
229 struct keyfield *next; /* Next keyfield to try. */
232 struct month
234 char const *name;
235 int val;
238 /* Binary merge tree node. */
239 struct merge_node
241 struct line *lo; /* Lines to merge from LO child node. */
242 struct line *hi; /* Lines to merge from HI child ndoe. */
243 struct line *end_lo; /* End of available lines from LO. */
244 struct line *end_hi; /* End of available lines from HI. */
245 struct line **dest; /* Pointer to destination of merge. */
246 size_t nlo; /* Total Lines remaining from LO. */
247 size_t nhi; /* Total lines remaining from HI. */
248 struct merge_node *parent; /* Parent node. */
249 struct merge_node *lo_child; /* LO child node. */
250 struct merge_node *hi_child; /* HI child node. */
251 unsigned int level; /* Level in merge tree. */
252 bool queued; /* Node is already in heap. */
253 pthread_mutex_t lock; /* Lock for node operations. */
256 /* Priority queue of merge nodes. */
257 struct merge_node_queue
259 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
260 pthread_mutex_t mutex; /* Lock for queue operations. */
261 pthread_cond_t cond; /* Conditional wait for empty queue to populate
262 when popping. */
265 /* FIXME: None of these tables work with multibyte character sets.
266 Also, there are many other bugs when handling multibyte characters.
267 One way to fix this is to rewrite 'sort' to use wide characters
268 internally, but doing this with good performance is a bit
269 tricky. */
271 /* Table of blanks. */
272 static bool blanks[UCHAR_LIM];
274 /* Table of non-printing characters. */
275 static bool nonprinting[UCHAR_LIM];
277 /* Table of non-dictionary characters (not letters, digits, or blanks). */
278 static bool nondictionary[UCHAR_LIM];
280 /* Translation table folding lower case to upper. */
281 static char fold_toupper[UCHAR_LIM];
283 #define MONTHS_PER_YEAR 12
285 /* Table mapping month names to integers.
286 Alphabetic order allows binary search. */
287 static struct month monthtab[] =
289 {"APR", 4},
290 {"AUG", 8},
291 {"DEC", 12},
292 {"FEB", 2},
293 {"JAN", 1},
294 {"JUL", 7},
295 {"JUN", 6},
296 {"MAR", 3},
297 {"MAY", 5},
298 {"NOV", 11},
299 {"OCT", 10},
300 {"SEP", 9}
303 /* During the merge phase, the number of files to merge at once. */
304 #define NMERGE_DEFAULT 16
306 /* Minimum size for a merge or check buffer. */
307 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
309 /* Minimum sort size; the code might not work with smaller sizes. */
310 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
312 /* The number of bytes needed for a merge or check buffer, which can
313 function relatively efficiently even if it holds only one line. If
314 a longer line is seen, this value is increased. */
315 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
317 /* The approximate maximum number of bytes of main memory to use, as
318 specified by the user. Zero if the user has not specified a size. */
319 static size_t sort_size;
321 /* The initial allocation factor for non-regular files.
322 This is used, e.g., when reading from a pipe.
323 Don't make it too big, since it is multiplied by ~130 to
324 obtain the size of the actual buffer sort will allocate.
325 Also, there may be 8 threads all doing this at the same time. */
326 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
328 /* Array of directory names in which any temporary files are to be created. */
329 static char const **temp_dirs;
331 /* Number of temporary directory names used. */
332 static size_t temp_dir_count;
334 /* Number of allocated slots in temp_dirs. */
335 static size_t temp_dir_alloc;
337 /* Flag to reverse the order of all comparisons. */
338 static bool reverse;
340 /* Flag for stable sort. This turns off the last ditch bytewise
341 comparison of lines, and instead leaves lines in the same order
342 they were read if all keys compare equal. */
343 static bool stable;
345 /* If TAB has this value, blanks separate fields. */
346 enum { TAB_DEFAULT = CHAR_MAX + 1 };
348 /* Tab character separating fields. If TAB_DEFAULT, then fields are
349 separated by the empty string between a non-blank character and a blank
350 character. */
351 static int tab = TAB_DEFAULT;
353 /* Flag to remove consecutive duplicate lines from the output.
354 Only the last of a sequence of equal lines will be output. */
355 static bool unique;
357 /* Nonzero if any of the input files are the standard input. */
358 static bool have_read_stdin;
360 /* List of key field comparisons to be tried. */
361 static struct keyfield *keylist;
363 /* Program used to (de)compress temp files. Must accept -d. */
364 static char const *compress_program;
366 /* Annotate the output with extra info to aid the user. */
367 static bool debug;
369 /* Maximum number of files to merge in one go. If more than this
370 number are present, temp files will be used. */
371 static unsigned int nmerge = NMERGE_DEFAULT;
373 /* Report MESSAGE for FILE, then clean up and exit.
374 If FILE is null, it represents standard output. */
376 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
377 static void
378 die (char const *message, char const *file)
380 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
381 exit (SORT_FAILURE);
384 void
385 usage (int status)
387 if (status != EXIT_SUCCESS)
388 emit_try_help ();
389 else
391 printf (_("\
392 Usage: %s [OPTION]... [FILE]...\n\
393 or: %s [OPTION]... --files0-from=F\n\
395 program_name, program_name);
396 fputs (_("\
397 Write sorted concatenation of all FILE(s) to standard output.\n\
399 "), stdout);
400 fputs (_("\
401 Mandatory arguments to long options are mandatory for short options too.\n\
402 "), stdout);
403 fputs (_("\
404 Ordering options:\n\
406 "), stdout);
407 fputs (_("\
408 -b, --ignore-leading-blanks ignore leading blanks\n\
409 -d, --dictionary-order consider only blanks and alphanumeric characters\
411 -f, --ignore-case fold lower case to upper case characters\n\
412 "), stdout);
413 fputs (_("\
414 -g, --general-numeric-sort compare according to general numerical value\n\
415 -i, --ignore-nonprinting consider only printable characters\n\
416 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
417 "), stdout);
418 fputs (_("\
419 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
420 "), stdout);
421 fputs (_("\
422 -n, --numeric-sort compare according to string numerical value\n\
423 -R, --random-sort sort by random hash of keys\n\
424 --random-source=FILE get random bytes from FILE\n\
425 -r, --reverse reverse the result of comparisons\n\
426 "), stdout);
427 fputs (_("\
428 --sort=WORD sort according to WORD:\n\
429 general-numeric -g, human-numeric -h, month -M,\
431 numeric -n, random -R, version -V\n\
432 -V, --version-sort natural sort of (version) numbers within text\n\
434 "), stdout);
435 fputs (_("\
436 Other options:\n\
438 "), stdout);
439 fputs (_("\
440 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
441 for more use temp files\n\
442 "), stdout);
443 fputs (_("\
444 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
445 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
447 --compress-program=PROG compress temporaries with PROG;\n\
448 decompress them with PROG -d\n\
449 "), stdout);
450 fputs (_("\
451 --debug annotate the part of the line used to sort,\n\
452 and warn about questionable usage to stderr\n\
453 --files0-from=F read input from the files specified by\n\
454 NUL-terminated names in file F;\n\
455 If F is - then read names from standard input\n\
456 "), stdout);
457 fputs (_("\
458 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
459 -m, --merge merge already sorted files; do not sort\n\
460 "), stdout);
461 fputs (_("\
462 -o, --output=FILE write result to FILE instead of standard output\n\
463 -s, --stable stabilize sort by disabling last-resort comparison\
465 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
466 "), stdout);
467 printf (_("\
468 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
469 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
470 multiple options specify multiple directories\n\
471 --parallel=N change the number of sorts run concurrently to N\n\
472 -u, --unique with -c, check for strict ordering;\n\
473 without -c, output only the first of an equal run\
475 "), DEFAULT_TMPDIR);
476 fputs (_("\
477 -z, --zero-terminated end lines with 0 byte, not newline\n\
478 "), stdout);
479 fputs (HELP_OPTION_DESCRIPTION, stdout);
480 fputs (VERSION_OPTION_DESCRIPTION, stdout);
481 fputs (_("\
483 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
484 field number and C a character position in the field; both are origin 1, and\n\
485 the stop position defaults to the line's end. If neither -t nor -b is in\n\
486 effect, characters in a field are counted from the beginning of the preceding\n\
487 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
489 which override global ordering options for that key. If no key is given, use\n\
490 the entire line as the key.\n\
492 SIZE may be followed by the following multiplicative suffixes:\n\
493 "), stdout);
494 fputs (_("\
495 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
497 With no FILE, or when FILE is -, read standard input.\n\
499 *** WARNING ***\n\
500 The locale specified by the environment affects sort order.\n\
501 Set LC_ALL=C to get the traditional sort order that uses\n\
502 native byte values.\n\
503 "), stdout );
504 emit_ancillary_info ();
507 exit (status);
510 /* For long options that have no equivalent short option, use a
511 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
512 enum
514 CHECK_OPTION = CHAR_MAX + 1,
515 COMPRESS_PROGRAM_OPTION,
516 DEBUG_PROGRAM_OPTION,
517 FILES0_FROM_OPTION,
518 NMERGE_OPTION,
519 RANDOM_SOURCE_OPTION,
520 SORT_OPTION,
521 PARALLEL_OPTION
524 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
526 static struct option const long_options[] =
528 {"ignore-leading-blanks", no_argument, NULL, 'b'},
529 {"check", optional_argument, NULL, CHECK_OPTION},
530 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
531 {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
532 {"dictionary-order", no_argument, NULL, 'd'},
533 {"ignore-case", no_argument, NULL, 'f'},
534 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
535 {"general-numeric-sort", no_argument, NULL, 'g'},
536 {"ignore-nonprinting", no_argument, NULL, 'i'},
537 {"key", required_argument, NULL, 'k'},
538 {"merge", no_argument, NULL, 'm'},
539 {"month-sort", no_argument, NULL, 'M'},
540 {"numeric-sort", no_argument, NULL, 'n'},
541 {"human-numeric-sort", no_argument, NULL, 'h'},
542 {"version-sort", no_argument, NULL, 'V'},
543 {"random-sort", no_argument, NULL, 'R'},
544 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
545 {"sort", required_argument, NULL, SORT_OPTION},
546 {"output", required_argument, NULL, 'o'},
547 {"reverse", no_argument, NULL, 'r'},
548 {"stable", no_argument, NULL, 's'},
549 {"batch-size", required_argument, NULL, NMERGE_OPTION},
550 {"buffer-size", required_argument, NULL, 'S'},
551 {"field-separator", required_argument, NULL, 't'},
552 {"temporary-directory", required_argument, NULL, 'T'},
553 {"unique", no_argument, NULL, 'u'},
554 {"zero-terminated", no_argument, NULL, 'z'},
555 {"parallel", required_argument, NULL, PARALLEL_OPTION},
556 {GETOPT_HELP_OPTION_DECL},
557 {GETOPT_VERSION_OPTION_DECL},
558 {NULL, 0, NULL, 0},
561 #define CHECK_TABLE \
562 _ct_("quiet", 'C') \
563 _ct_("silent", 'C') \
564 _ct_("diagnose-first", 'c')
566 static char const *const check_args[] =
568 #define _ct_(_s, _c) _s,
569 CHECK_TABLE NULL
570 #undef _ct_
572 static char const check_types[] =
574 #define _ct_(_s, _c) _c,
575 CHECK_TABLE
576 #undef _ct_
579 #define SORT_TABLE \
580 _st_("general-numeric", 'g') \
581 _st_("human-numeric", 'h') \
582 _st_("month", 'M') \
583 _st_("numeric", 'n') \
584 _st_("random", 'R') \
585 _st_("version", 'V')
587 static char const *const sort_args[] =
589 #define _st_(_s, _c) _s,
590 SORT_TABLE NULL
591 #undef _st_
593 static char const sort_types[] =
595 #define _st_(_s, _c) _c,
596 SORT_TABLE
597 #undef _st_
600 /* The set of signals that are caught. */
601 static sigset_t caught_signals;
603 /* Critical section status. */
604 struct cs_status
606 bool valid;
607 sigset_t sigs;
610 /* Enter a critical section. */
611 static struct cs_status
612 cs_enter (void)
614 struct cs_status status;
615 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
616 return status;
619 /* Leave a critical section. */
620 static void
621 cs_leave (struct cs_status status)
623 if (status.valid)
625 /* Ignore failure when restoring the signal mask. */
626 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
630 /* Possible states for a temp file. If compressed, the file's status
631 is unreaped or reaped, depending on whether 'sort' has waited for
632 the subprocess to finish. */
633 enum { UNCOMPRESSED, UNREAPED, REAPED };
635 /* The list of temporary files. */
636 struct tempnode
638 struct tempnode *volatile next;
639 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
640 char state;
641 char name[1]; /* Actual size is 1 + file name length. */
643 static struct tempnode *volatile temphead;
644 static struct tempnode *volatile *temptail = &temphead;
646 /* A file to be sorted. */
647 struct sortfile
649 /* The file's name. */
650 char const *name;
652 /* Nonnull if this is a temporary file, in which case NAME == TEMP->name. */
653 struct tempnode *temp;
656 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
657 static Hash_table *proctab;
659 enum { INIT_PROCTAB_SIZE = 47 };
661 static size_t
662 proctab_hasher (void const *entry, size_t tabsize)
664 struct tempnode const *node = entry;
665 return node->pid % tabsize;
668 static bool
669 proctab_comparator (void const *e1, void const *e2)
671 struct tempnode const *n1 = e1;
672 struct tempnode const *n2 = e2;
673 return n1->pid == n2->pid;
676 /* The number of unreaped child processes. */
677 static pid_t nprocs;
679 static bool delete_proc (pid_t);
681 /* If PID is positive, wait for the child process with that PID to
682 exit, and assume that PID has already been removed from the process
683 table. If PID is 0 or -1, clean up some child that has exited (by
684 waiting for it, and removing it from the proc table) and return the
685 child's process ID. However, if PID is 0 and no children have
686 exited, return 0 without waiting. */
688 static pid_t
689 reap (pid_t pid)
691 int status;
692 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
694 if (cpid < 0)
695 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
696 compress_program);
697 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
699 if (! WIFEXITED (status) || WEXITSTATUS (status))
700 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
701 compress_program);
702 --nprocs;
705 return cpid;
708 /* TEMP represents a new process; add it to the process table. Create
709 the process table the first time it's called. */
711 static void
712 register_proc (struct tempnode *temp)
714 if (! proctab)
716 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
717 proctab_hasher,
718 proctab_comparator,
719 NULL);
720 if (! proctab)
721 xalloc_die ();
724 temp->state = UNREAPED;
726 if (! hash_insert (proctab, temp))
727 xalloc_die ();
730 /* If PID is in the process table, remove it and return true.
731 Otherwise, return false. */
733 static bool
734 delete_proc (pid_t pid)
736 struct tempnode test;
738 test.pid = pid;
739 struct tempnode *node = hash_delete (proctab, &test);
740 if (! node)
741 return false;
742 node->state = REAPED;
743 return true;
746 /* Remove PID from the process table, and wait for it to exit if it
747 hasn't already. */
749 static void
750 wait_proc (pid_t pid)
752 if (delete_proc (pid))
753 reap (pid);
756 /* Reap any exited children. Do not block; reap only those that have
757 already exited. */
759 static void
760 reap_exited (void)
762 while (0 < nprocs && reap (0))
763 continue;
766 /* Reap at least one exited child, waiting if necessary. */
768 static void
769 reap_some (void)
771 reap (-1);
772 reap_exited ();
775 /* Reap all children, waiting if necessary. */
777 static void
778 reap_all (void)
780 while (0 < nprocs)
781 reap (-1);
784 /* Clean up any remaining temporary files. */
786 static void
787 cleanup (void)
789 struct tempnode const *node;
791 for (node = temphead; node; node = node->next)
792 unlink (node->name);
793 temphead = NULL;
796 /* Cleanup actions to take when exiting. */
798 static void
799 exit_cleanup (void)
801 if (temphead)
803 /* Clean up any remaining temporary files in a critical section so
804 that a signal handler does not try to clean them too. */
805 struct cs_status cs = cs_enter ();
806 cleanup ();
807 cs_leave (cs);
810 close_stdout ();
813 /* Create a new temporary file, returning its newly allocated tempnode.
814 Store into *PFD the file descriptor open for writing.
815 If the creation fails, return NULL and store -1 into *PFD if the
816 failure is due to file descriptor exhaustion and
817 SURVIVE_FD_EXHAUSTION; otherwise, die. */
819 static struct tempnode *
820 create_temp_file (int *pfd, bool survive_fd_exhaustion)
822 static char const slashbase[] = "/sortXXXXXX";
823 static size_t temp_dir_index;
824 int fd;
825 int saved_errno;
826 char const *temp_dir = temp_dirs[temp_dir_index];
827 size_t len = strlen (temp_dir);
828 struct tempnode *node =
829 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
830 char *file = node->name;
831 struct cs_status cs;
833 memcpy (file, temp_dir, len);
834 memcpy (file + len, slashbase, sizeof slashbase);
835 node->next = NULL;
836 if (++temp_dir_index == temp_dir_count)
837 temp_dir_index = 0;
839 /* Create the temporary file in a critical section, to avoid races. */
840 cs = cs_enter ();
841 fd = mkstemp (file);
842 if (0 <= fd)
844 *temptail = node;
845 temptail = &node->next;
847 saved_errno = errno;
848 cs_leave (cs);
849 errno = saved_errno;
851 if (fd < 0)
853 if (! (survive_fd_exhaustion && errno == EMFILE))
854 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
855 quote (temp_dir));
856 free (node);
857 node = NULL;
860 *pfd = fd;
861 return node;
864 /* Return a stream for FILE, opened with mode HOW. A null FILE means
865 standard output; HOW should be "w". When opening for input, "-"
866 means standard input. To avoid confusion, do not return file
867 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
868 opening an ordinary FILE. Return NULL if unsuccessful.
870 fadvise() is used to specify an access pattern for input files.
871 There are a few hints we could possibly provide,
872 and after careful testing it was decided that
873 specifying POSIX_FADV_SEQUENTIAL was not detrimental
874 to any cases. On Linux 2.6.31, this option doubles
875 the size of read ahead performed and thus was seen to
876 benefit these cases:
877 Merging
878 Sorting with a smaller internal buffer
879 Reading from faster flash devices
881 In _addition_ one could also specify other hints...
883 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
884 at least uses that to _synchronously_ prepopulate the cache
885 with the specified range. While sort does need to
886 read all of its input before outputting, a synchronous
887 read of the whole file up front precludes any processing
888 that sort could do in parallel with the system doing
889 read ahead of the data. This was seen to have negative effects
890 in a couple of cases:
891 Merging
892 Sorting with a smaller internal buffer
893 Note this option was seen to shorten the runtime for sort
894 on a multicore system with lots of RAM and other processes
895 competing for CPU. It could be argued that more explicit
896 scheduling hints with 'nice' et. al. are more appropriate
897 for this situation.
899 POSIX_FADV_NOREUSE is a possibility as it could lower
900 the priority of input data in the cache as sort will
901 only need to process it once. However its functionality
902 has changed over Linux kernel versions and as of 2.6.31
903 it does nothing and thus we can't depend on what it might
904 do in future.
906 POSIX_FADV_DONTNEED is not appropriate for user specified
907 input files, but for temp files we do want to drop the
908 cache immediately after processing. This is done implicitly
909 however when the files are unlinked. */
911 static FILE *
912 stream_open (char const *file, char const *how)
914 if (!file)
915 return stdout;
916 if (*how == 'r')
918 FILE *fp;
919 if (STREQ (file, "-"))
921 have_read_stdin = true;
922 fp = stdin;
924 else
925 fp = fopen (file, how);
926 fadvise (fp, FADVISE_SEQUENTIAL);
927 return fp;
929 return fopen (file, how);
932 /* Same as stream_open, except always return a non-null value; die on
933 failure. */
935 static FILE *
936 xfopen (char const *file, char const *how)
938 FILE *fp = stream_open (file, how);
939 if (!fp)
940 die (_("open failed"), file);
941 return fp;
944 /* Close FP, whose name is FILE, and report any errors. */
946 static void
947 xfclose (FILE *fp, char const *file)
949 switch (fileno (fp))
951 case STDIN_FILENO:
952 /* Allow reading stdin from tty more than once. */
953 if (feof (fp))
954 clearerr (fp);
955 break;
957 case STDOUT_FILENO:
958 /* Don't close stdout just yet. close_stdout does that. */
959 if (fflush (fp) != 0)
960 die (_("fflush failed"), file);
961 break;
963 default:
964 if (fclose (fp) != 0)
965 die (_("close failed"), file);
966 break;
970 static void
971 dup2_or_die (int oldfd, int newfd)
973 if (dup2 (oldfd, newfd) < 0)
974 error (SORT_FAILURE, errno, _("dup2 failed"));
977 /* Fork a child process for piping to and do common cleanup. The
978 TRIES parameter tells us how many times to try to fork before
979 giving up. Return the PID of the child, or -1 (setting errno)
980 on failure. */
982 static pid_t
983 pipe_fork (int pipefds[2], size_t tries)
985 #if HAVE_WORKING_FORK
986 struct tempnode *saved_temphead;
987 int saved_errno;
988 double wait_retry = 0.25;
989 pid_t pid IF_LINT ( = -1);
990 struct cs_status cs;
992 if (pipe (pipefds) < 0)
993 return -1;
995 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
996 uncontrolled subprocess generation can hurt performance significantly.
997 Allow at most NMERGE + 2 subprocesses, on the theory that there
998 may be some useful parallelism by letting compression for the
999 previous merge finish (1 subprocess) in parallel with the current
1000 merge (NMERGE + 1 subprocesses). */
1002 if (nmerge + 1 < nprocs)
1003 reap_some ();
1005 while (tries--)
1007 /* This is so the child process won't delete our temp files
1008 if it receives a signal before exec-ing. */
1009 cs = cs_enter ();
1010 saved_temphead = temphead;
1011 temphead = NULL;
1013 pid = fork ();
1014 saved_errno = errno;
1015 if (pid)
1016 temphead = saved_temphead;
1018 cs_leave (cs);
1019 errno = saved_errno;
1021 if (0 <= pid || errno != EAGAIN)
1022 break;
1023 else
1025 xnanosleep (wait_retry);
1026 wait_retry *= 2;
1027 reap_exited ();
1031 if (pid < 0)
1033 saved_errno = errno;
1034 close (pipefds[0]);
1035 close (pipefds[1]);
1036 errno = saved_errno;
1038 else if (pid == 0)
1040 close (STDIN_FILENO);
1041 close (STDOUT_FILENO);
1043 else
1044 ++nprocs;
1046 return pid;
1048 #else /* ! HAVE_WORKING_FORK */
1049 return -1;
1050 #endif
1053 /* Create a temporary file and, if asked for, start a compressor
1054 to that file. Set *PFP to the file handle and return
1055 the address of the new temp node. If the creation
1056 fails, return NULL if the failure is due to file descriptor
1057 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1059 static struct tempnode *
1060 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1062 int tempfd;
1063 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1064 if (! node)
1065 return NULL;
1067 node->state = UNCOMPRESSED;
1069 if (compress_program)
1071 int pipefds[2];
1073 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1074 if (0 < node->pid)
1076 close (tempfd);
1077 close (pipefds[0]);
1078 tempfd = pipefds[1];
1080 register_proc (node);
1082 else if (node->pid == 0)
1084 close (pipefds[1]);
1085 dup2_or_die (tempfd, STDOUT_FILENO);
1086 close (tempfd);
1087 dup2_or_die (pipefds[0], STDIN_FILENO);
1088 close (pipefds[0]);
1090 if (execlp (compress_program, compress_program, (char *) NULL) < 0)
1091 error (SORT_FAILURE, errno, _("couldn't execute %s"),
1092 compress_program);
1096 *pfp = fdopen (tempfd, "w");
1097 if (! *pfp)
1098 die (_("couldn't create temporary file"), node->name);
1100 return node;
1103 /* Create a temporary file and, if asked for, start a compressor
1104 to that file. Set *PFP to the file handle and return the address
1105 of the new temp node. Die on failure. */
1107 static struct tempnode *
1108 create_temp (FILE **pfp)
1110 return maybe_create_temp (pfp, false);
1113 /* Open a compressed temp file and start a decompression process through
1114 which to filter the input. Return NULL (setting errno to
1115 EMFILE) if we ran out of file descriptors, and die on any other
1116 kind of failure. */
1118 static FILE *
1119 open_temp (struct tempnode *temp)
1121 int tempfd, pipefds[2];
1122 FILE *fp = NULL;
1124 if (temp->state == UNREAPED)
1125 wait_proc (temp->pid);
1127 tempfd = open (temp->name, O_RDONLY);
1128 if (tempfd < 0)
1129 return NULL;
1131 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1133 switch (child)
1135 case -1:
1136 if (errno != EMFILE)
1137 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1138 compress_program);
1139 close (tempfd);
1140 errno = EMFILE;
1141 break;
1143 case 0:
1144 close (pipefds[0]);
1145 dup2_or_die (tempfd, STDIN_FILENO);
1146 close (tempfd);
1147 dup2_or_die (pipefds[1], STDOUT_FILENO);
1148 close (pipefds[1]);
1150 execlp (compress_program, compress_program, "-d", (char *) NULL);
1151 error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1152 compress_program);
1154 default:
1155 temp->pid = child;
1156 register_proc (temp);
1157 close (tempfd);
1158 close (pipefds[1]);
1160 fp = fdopen (pipefds[0], "r");
1161 if (! fp)
1163 int saved_errno = errno;
1164 close (pipefds[0]);
1165 errno = saved_errno;
1167 break;
1170 return fp;
1173 /* Append DIR to the array of temporary directory names. */
1174 static void
1175 add_temp_dir (char const *dir)
1177 if (temp_dir_count == temp_dir_alloc)
1178 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1180 temp_dirs[temp_dir_count++] = dir;
1183 /* Remove NAME from the list of temporary files. */
1185 static void
1186 zaptemp (char const *name)
1188 struct tempnode *volatile *pnode;
1189 struct tempnode *node;
1190 struct tempnode *next;
1191 int unlink_status;
1192 int unlink_errno = 0;
1193 struct cs_status cs;
1195 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1196 continue;
1198 if (node->state == UNREAPED)
1199 wait_proc (node->pid);
1201 /* Unlink the temporary file in a critical section to avoid races. */
1202 next = node->next;
1203 cs = cs_enter ();
1204 unlink_status = unlink (name);
1205 unlink_errno = errno;
1206 *pnode = next;
1207 cs_leave (cs);
1209 if (unlink_status != 0)
1210 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1211 if (! next)
1212 temptail = pnode;
1213 free (node);
1216 #if HAVE_NL_LANGINFO
1218 static int
1219 struct_month_cmp (void const *m1, void const *m2)
1221 struct month const *month1 = m1;
1222 struct month const *month2 = m2;
1223 return strcmp (month1->name, month2->name);
1226 #endif
1228 /* Initialize the character class tables. */
1230 static void
1231 inittables (void)
1233 size_t i;
1235 for (i = 0; i < UCHAR_LIM; ++i)
1237 blanks[i] = !! isblank (i);
1238 nonprinting[i] = ! isprint (i);
1239 nondictionary[i] = ! isalnum (i) && ! isblank (i);
1240 fold_toupper[i] = toupper (i);
1243 #if HAVE_NL_LANGINFO
1244 /* If we're not in the "C" locale, read different names for months. */
1245 if (hard_LC_TIME)
1247 for (i = 0; i < MONTHS_PER_YEAR; i++)
1249 char const *s;
1250 size_t s_len;
1251 size_t j, k;
1252 char *name;
1254 s = nl_langinfo (ABMON_1 + i);
1255 s_len = strlen (s);
1256 monthtab[i].name = name = xmalloc (s_len + 1);
1257 monthtab[i].val = i + 1;
1259 for (j = k = 0; j < s_len; j++)
1260 if (! isblank (to_uchar (s[j])))
1261 name[k++] = fold_toupper[to_uchar (s[j])];
1262 name[k] = '\0';
1264 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1266 #endif
1269 /* Specify how many inputs may be merged at once.
1270 This may be set on the command-line with the
1271 --batch-size option. */
1272 static void
1273 specify_nmerge (int oi, char c, char const *s)
1275 uintmax_t n;
1276 struct rlimit rlimit;
1277 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1279 /* Try to find out how many file descriptors we'll be able
1280 to open. We need at least nmerge + 3 (STDIN_FILENO,
1281 STDOUT_FILENO and STDERR_FILENO). */
1282 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1283 ? rlimit.rlim_cur
1284 : OPEN_MAX)
1285 - 3);
1287 if (e == LONGINT_OK)
1289 nmerge = n;
1290 if (nmerge != n)
1291 e = LONGINT_OVERFLOW;
1292 else
1294 if (nmerge < 2)
1296 error (0, 0, _("invalid --%s argument %s"),
1297 long_options[oi].name, quote (s));
1298 error (SORT_FAILURE, 0,
1299 _("minimum --%s argument is %s"),
1300 long_options[oi].name, quote ("2"));
1302 else if (max_nmerge < nmerge)
1304 e = LONGINT_OVERFLOW;
1306 else
1307 return;
1311 if (e == LONGINT_OVERFLOW)
1313 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1314 error (0, 0, _("--%s argument %s too large"),
1315 long_options[oi].name, quote (s));
1316 error (SORT_FAILURE, 0,
1317 _("maximum --%s argument with current rlimit is %s"),
1318 long_options[oi].name,
1319 uinttostr (max_nmerge, max_nmerge_buf));
1321 else
1322 xstrtol_fatal (e, oi, c, long_options, s);
1325 /* Specify the amount of main memory to use when sorting. */
1326 static void
1327 specify_sort_size (int oi, char c, char const *s)
1329 uintmax_t n;
1330 char *suffix;
1331 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1333 /* The default unit is KiB. */
1334 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1336 if (n <= UINTMAX_MAX / 1024)
1337 n *= 1024;
1338 else
1339 e = LONGINT_OVERFLOW;
1342 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1343 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1344 switch (suffix[0])
1346 case 'b':
1347 e = LONGINT_OK;
1348 break;
1350 case '%':
1352 double mem = physmem_total () * n / 100;
1354 /* Use "<", not "<=", to avoid problems with rounding. */
1355 if (mem < UINTMAX_MAX)
1357 n = mem;
1358 e = LONGINT_OK;
1360 else
1361 e = LONGINT_OVERFLOW;
1363 break;
1366 if (e == LONGINT_OK)
1368 /* If multiple sort sizes are specified, take the maximum, so
1369 that option order does not matter. */
1370 if (n < sort_size)
1371 return;
1373 sort_size = n;
1374 if (sort_size == n)
1376 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1377 return;
1380 e = LONGINT_OVERFLOW;
1383 xstrtol_fatal (e, oi, c, long_options, s);
1386 /* Specify the number of threads to spawn during internal sort. */
1387 static size_t
1388 specify_nthreads (int oi, char c, char const *s)
1390 unsigned long int nthreads;
1391 enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1392 if (e == LONGINT_OVERFLOW)
1393 return SIZE_MAX;
1394 if (e != LONGINT_OK)
1395 xstrtol_fatal (e, oi, c, long_options, s);
1396 if (SIZE_MAX < nthreads)
1397 nthreads = SIZE_MAX;
1398 if (nthreads == 0)
1399 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1400 return nthreads;
1403 /* Return the default sort size. */
1404 static size_t
1405 default_sort_size (void)
1407 /* Let SIZE be MEM, but no more than the maximum object size or
1408 system resource limits. Don't bother to check for values like
1409 RLIM_INFINITY since in practice they are not much less than SIZE_MAX. */
1410 size_t size = SIZE_MAX;
1411 struct rlimit rlimit;
1412 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1413 size = rlimit.rlim_cur;
1414 #ifdef RLIMIT_AS
1415 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1416 size = rlimit.rlim_cur;
1417 #endif
1419 /* Leave a large safety margin for the above limits, as failure can
1420 occur when they are exceeded. */
1421 size /= 2;
1423 #ifdef RLIMIT_RSS
1424 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1425 Exceeding RSS is not fatal, but can be quite slow. */
1426 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1427 size = rlimit.rlim_cur / 16 * 15;
1428 #endif
1430 /* Let MEM be available memory or 1/8 of total memory, whichever
1431 is greater. */
1432 double avail = physmem_available ();
1433 double total = physmem_total ();
1434 double mem = MAX (avail, total / 8);
1436 /* Return the minimum of MEM and SIZE, but no less than
1437 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1438 right when only one argument is floating point. */
1439 if (mem < size)
1440 size = mem;
1441 return MAX (size, MIN_SORT_SIZE);
1444 /* Return the sort buffer size to use with the input files identified
1445 by FPS and FILES, which are alternate names of the same files.
1446 NFILES gives the number of input files; NFPS may be less. Assume
1447 that each input line requires LINE_BYTES extra bytes' worth of line
1448 information. Do not exceed the size bound specified by the user
1449 (or a default size bound, if the user does not specify one). */
1451 static size_t
1452 sort_buffer_size (FILE *const *fps, size_t nfps,
1453 char *const *files, size_t nfiles,
1454 size_t line_bytes)
1456 /* A bound on the input size. If zero, the bound hasn't been
1457 determined yet. */
1458 static size_t size_bound;
1460 /* In the worst case, each input byte is a newline. */
1461 size_t worst_case_per_input_byte = line_bytes + 1;
1463 /* Keep enough room for one extra input line and an extra byte.
1464 This extra room might be needed when preparing to read EOF. */
1465 size_t size = worst_case_per_input_byte + 1;
1467 size_t i;
1469 for (i = 0; i < nfiles; i++)
1471 struct stat st;
1472 off_t file_size;
1473 size_t worst_case;
1475 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1476 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1477 : stat (files[i], &st))
1478 != 0)
1479 die (_("stat failed"), files[i]);
1481 if (S_ISREG (st.st_mode))
1482 file_size = st.st_size;
1483 else
1485 /* The file has unknown size. If the user specified a sort
1486 buffer size, use that; otherwise, guess the size. */
1487 if (sort_size)
1488 return sort_size;
1489 file_size = INPUT_FILE_SIZE_GUESS;
1492 if (! size_bound)
1494 size_bound = sort_size;
1495 if (! size_bound)
1496 size_bound = default_sort_size ();
1499 /* Add the amount of memory needed to represent the worst case
1500 where the input consists entirely of newlines followed by a
1501 single non-newline. Check for overflow. */
1502 worst_case = file_size * worst_case_per_input_byte + 1;
1503 if (file_size != worst_case / worst_case_per_input_byte
1504 || size_bound - size <= worst_case)
1505 return size_bound;
1506 size += worst_case;
1509 return size;
1512 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1513 must be at least sizeof (struct line). Allocate ALLOC bytes
1514 initially. */
1516 static void
1517 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1519 /* Ensure that the line array is properly aligned. If the desired
1520 size cannot be allocated, repeatedly halve it until allocation
1521 succeeds. The smaller allocation may hurt overall performance,
1522 but that's better than failing. */
1523 while (true)
1525 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1526 buf->buf = malloc (alloc);
1527 if (buf->buf)
1528 break;
1529 alloc /= 2;
1530 if (alloc <= line_bytes + 1)
1531 xalloc_die ();
1534 buf->line_bytes = line_bytes;
1535 buf->alloc = alloc;
1536 buf->used = buf->left = buf->nlines = 0;
1537 buf->eof = false;
1540 /* Return one past the limit of the line array. */
1542 static inline struct line *
1543 buffer_linelim (struct buffer const *buf)
1545 return (struct line *) (buf->buf + buf->alloc);
1548 /* Return a pointer to the first character of the field specified
1549 by KEY in LINE. */
1551 static char *
1552 begfield (struct line const *line, struct keyfield const *key)
1554 char *ptr = line->text, *lim = ptr + line->length - 1;
1555 size_t sword = key->sword;
1556 size_t schar = key->schar;
1558 /* The leading field separator itself is included in a field when -t
1559 is absent. */
1561 if (tab != TAB_DEFAULT)
1562 while (ptr < lim && sword--)
1564 while (ptr < lim && *ptr != tab)
1565 ++ptr;
1566 if (ptr < lim)
1567 ++ptr;
1569 else
1570 while (ptr < lim && sword--)
1572 while (ptr < lim && blanks[to_uchar (*ptr)])
1573 ++ptr;
1574 while (ptr < lim && !blanks[to_uchar (*ptr)])
1575 ++ptr;
1578 /* If we're ignoring leading blanks when computing the Start
1579 of the field, skip past them here. */
1580 if (key->skipsblanks)
1581 while (ptr < lim && blanks[to_uchar (*ptr)])
1582 ++ptr;
1584 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1585 ptr = MIN (lim, ptr + schar);
1587 return ptr;
1590 /* Return the limit of (a pointer to the first character after) the field
1591 in LINE specified by KEY. */
1593 static char *
1594 limfield (struct line const *line, struct keyfield const *key)
1596 char *ptr = line->text, *lim = ptr + line->length - 1;
1597 size_t eword = key->eword, echar = key->echar;
1599 if (echar == 0)
1600 eword++; /* Skip all of end field. */
1602 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1603 whichever comes first. If there are more than EWORD fields, leave
1604 PTR pointing at the beginning of the field having zero-based index,
1605 EWORD. If a delimiter character was specified (via -t), then that
1606 'beginning' is the first character following the delimiting TAB.
1607 Otherwise, leave PTR pointing at the first 'blank' character after
1608 the preceding field. */
1609 if (tab != TAB_DEFAULT)
1610 while (ptr < lim && eword--)
1612 while (ptr < lim && *ptr != tab)
1613 ++ptr;
1614 if (ptr < lim && (eword || echar))
1615 ++ptr;
1617 else
1618 while (ptr < lim && eword--)
1620 while (ptr < lim && blanks[to_uchar (*ptr)])
1621 ++ptr;
1622 while (ptr < lim && !blanks[to_uchar (*ptr)])
1623 ++ptr;
1626 #ifdef POSIX_UNSPECIFIED
1627 /* The following block of code makes GNU sort incompatible with
1628 standard Unix sort, so it's ifdef'd out for now.
1629 The POSIX spec isn't clear on how to interpret this.
1630 FIXME: request clarification.
1632 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1633 Date: Thu, 30 May 96 12:20:41 -0400
1634 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1636 [...]I believe I've found another bug in 'sort'.
1638 $ cat /tmp/sort.in
1639 a b c 2 d
1640 pq rs 1 t
1641 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1642 a b c 2 d
1643 pq rs 1 t
1644 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1645 pq rs 1 t
1646 a b c 2 d
1648 Unix sort produced the answer I expected: sort on the single character
1649 in column 7. GNU sort produced different results, because it disagrees
1650 on the interpretation of the key-end spec "M.N". Unix sort reads this
1651 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1652 "skip M-1 fields, then either N-1 characters or the rest of the current
1653 field, whichever comes first". This extra clause applies only to
1654 key-ends, not key-starts.
1657 /* Make LIM point to the end of (one byte past) the current field. */
1658 if (tab != TAB_DEFAULT)
1660 char *newlim;
1661 newlim = memchr (ptr, tab, lim - ptr);
1662 if (newlim)
1663 lim = newlim;
1665 else
1667 char *newlim;
1668 newlim = ptr;
1669 while (newlim < lim && blanks[to_uchar (*newlim)])
1670 ++newlim;
1671 while (newlim < lim && !blanks[to_uchar (*newlim)])
1672 ++newlim;
1673 lim = newlim;
1675 #endif
1677 if (echar != 0) /* We need to skip over a portion of the end field. */
1679 /* If we're ignoring leading blanks when computing the End
1680 of the field, skip past them here. */
1681 if (key->skipeblanks)
1682 while (ptr < lim && blanks[to_uchar (*ptr)])
1683 ++ptr;
1685 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1686 ptr = MIN (lim, ptr + echar);
1689 return ptr;
1692 /* Fill BUF reading from FP, moving buf->left bytes from the end
1693 of buf->buf to the beginning first. If EOF is reached and the
1694 file wasn't terminated by a newline, supply one. Set up BUF's line
1695 table too. FILE is the name of the file corresponding to FP.
1696 Return true if some input was read. */
1698 static bool
1699 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1701 struct keyfield const *key = keylist;
1702 char eol = eolchar;
1703 size_t line_bytes = buf->line_bytes;
1704 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1706 if (buf->eof)
1707 return false;
1709 if (buf->used != buf->left)
1711 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1712 buf->used = buf->left;
1713 buf->nlines = 0;
1716 while (true)
1718 char *ptr = buf->buf + buf->used;
1719 struct line *linelim = buffer_linelim (buf);
1720 struct line *line = linelim - buf->nlines;
1721 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1722 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1724 while (line_bytes + 1 < avail)
1726 /* Read as many bytes as possible, but do not read so many
1727 bytes that there might not be enough room for the
1728 corresponding line array. The worst case is when the
1729 rest of the input file consists entirely of newlines,
1730 except that the last byte is not a newline. */
1731 size_t readsize = (avail - 1) / (line_bytes + 1);
1732 size_t bytes_read = fread (ptr, 1, readsize, fp);
1733 char *ptrlim = ptr + bytes_read;
1734 char *p;
1735 avail -= bytes_read;
1737 if (bytes_read != readsize)
1739 if (ferror (fp))
1740 die (_("read failed"), file);
1741 if (feof (fp))
1743 buf->eof = true;
1744 if (buf->buf == ptrlim)
1745 return false;
1746 if (line_start != ptrlim && ptrlim[-1] != eol)
1747 *ptrlim++ = eol;
1751 /* Find and record each line in the just-read input. */
1752 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1754 /* Delimit the line with NUL. This eliminates the need to
1755 temporarily replace the last byte with NUL when calling
1756 xmemcoll(), which increases performance. */
1757 *p = '\0';
1758 ptr = p + 1;
1759 line--;
1760 line->text = line_start;
1761 line->length = ptr - line_start;
1762 mergesize = MAX (mergesize, line->length);
1763 avail -= line_bytes;
1765 if (key)
1767 /* Precompute the position of the first key for
1768 efficiency. */
1769 line->keylim = (key->eword == SIZE_MAX
1771 : limfield (line, key));
1773 if (key->sword != SIZE_MAX)
1774 line->keybeg = begfield (line, key);
1775 else
1777 if (key->skipsblanks)
1778 while (blanks[to_uchar (*line_start)])
1779 line_start++;
1780 line->keybeg = line_start;
1784 line_start = ptr;
1787 ptr = ptrlim;
1788 if (buf->eof)
1789 break;
1792 buf->used = ptr - buf->buf;
1793 buf->nlines = buffer_linelim (buf) - line;
1794 if (buf->nlines != 0)
1796 buf->left = ptr - line_start;
1797 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1798 return true;
1802 /* The current input line is too long to fit in the buffer.
1803 Double the buffer size and try again, keeping it properly
1804 aligned. */
1805 size_t line_alloc = buf->alloc / sizeof (struct line);
1806 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1807 buf->alloc = line_alloc * sizeof (struct line);
1812 /* Table that maps characters to order-of-magnitude values. */
1813 static char const unit_order[UCHAR_LIM] =
1815 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1816 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1817 /* This initializer syntax works on all C99 hosts. For now, use
1818 it only on non-ASCII hosts, to ease the pain of porting to
1819 pre-C99 ASCII hosts. */
1820 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1821 ['k']=1,
1822 #else
1823 /* Generate the following table with this command:
1824 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1825 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1826 |fmt */
1827 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1828 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1829 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1830 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1831 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1832 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1833 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1834 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1835 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1836 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1837 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1838 #endif
1841 /* Return an integer that represents the order of magnitude of the
1842 unit following the number. The number may contain thousands
1843 separators and a decimal point, but it may not contain leading blanks.
1844 Negative numbers get negative orders; zero numbers have a zero order. */
1846 static int _GL_ATTRIBUTE_PURE
1847 find_unit_order (char const *number)
1849 bool minus_sign = (*number == '-');
1850 char const *p = number + minus_sign;
1851 int nonzero = 0;
1852 unsigned char ch;
1854 /* Scan to end of number.
1855 Decimals or separators not followed by digits stop the scan.
1856 Numbers ending in decimals or separators are thus considered
1857 to be lacking in units.
1858 FIXME: add support for multibyte thousands_sep and decimal_point. */
1862 while (ISDIGIT (ch = *p++))
1863 nonzero |= ch - '0';
1865 while (ch == thousands_sep);
1867 if (ch == decimal_point)
1868 while (ISDIGIT (ch = *p++))
1869 nonzero |= ch - '0';
1871 if (nonzero)
1873 int order = unit_order[ch];
1874 return (minus_sign ? -order : order);
1876 else
1877 return 0;
1880 /* Compare numbers A and B ending in units with SI or IEC prefixes
1881 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1883 static int
1884 human_numcompare (char const *a, char const *b)
1886 while (blanks[to_uchar (*a)])
1887 a++;
1888 while (blanks[to_uchar (*b)])
1889 b++;
1891 int diff = find_unit_order (a) - find_unit_order (b);
1892 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1895 /* Compare strings A and B as numbers without explicitly converting them to
1896 machine numbers. Comparatively slow for short strings, but asymptotically
1897 hideously fast. */
1899 static int
1900 numcompare (char const *a, char const *b)
1902 while (blanks[to_uchar (*a)])
1903 a++;
1904 while (blanks[to_uchar (*b)])
1905 b++;
1907 return strnumcmp (a, b, decimal_point, thousands_sep);
1910 /* Work around a problem whereby the long double value returned by glibc's
1911 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
1912 A and B before calling strtold. FIXME: remove this function once
1913 gnulib guarantees that strtold's result is always well defined. */
1914 static int
1915 nan_compare (char const *sa, char const *sb)
1917 long_double a;
1918 memset (&a, 0, sizeof a);
1919 a = strtold (sa, NULL);
1921 long_double b;
1922 memset (&b, 0, sizeof b);
1923 b = strtold (sb, NULL);
1925 return memcmp (&a, &b, sizeof a);
1928 static int
1929 general_numcompare (char const *sa, char const *sb)
1931 /* FIXME: maybe add option to try expensive FP conversion
1932 only if A and B can't be compared more cheaply/accurately. */
1934 char *ea;
1935 char *eb;
1936 long_double a = strtold (sa, &ea);
1937 long_double b = strtold (sb, &eb);
1939 /* Put conversion errors at the start of the collating sequence. */
1940 if (sa == ea)
1941 return sb == eb ? 0 : -1;
1942 if (sb == eb)
1943 return 1;
1945 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1946 conversion errors but before numbers; sort them by internal
1947 bit-pattern, for lack of a more portable alternative. */
1948 return (a < b ? -1
1949 : a > b ? 1
1950 : a == b ? 0
1951 : b == b ? -1
1952 : a == a ? 1
1953 : nan_compare (sa, sb));
1956 /* Return an integer in 1..12 of the month name MONTH.
1957 Return 0 if the name in S is not recognized. */
1959 static int
1960 getmonth (char const *month, char **ea)
1962 size_t lo = 0;
1963 size_t hi = MONTHS_PER_YEAR;
1965 while (blanks[to_uchar (*month)])
1966 month++;
1970 size_t ix = (lo + hi) / 2;
1971 char const *m = month;
1972 char const *n = monthtab[ix].name;
1974 for (;; m++, n++)
1976 if (!*n)
1978 if (ea)
1979 *ea = (char *) m;
1980 return monthtab[ix].val;
1982 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
1984 hi = ix;
1985 break;
1987 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
1989 lo = ix + 1;
1990 break;
1994 while (lo < hi);
1996 return 0;
1999 /* A randomly chosen MD5 state, used for random comparison. */
2000 static struct md5_ctx random_md5_state;
2002 /* Initialize the randomly chosen MD5 state. */
2004 static void
2005 random_md5_state_init (char const *random_source)
2007 unsigned char buf[MD5_DIGEST_SIZE];
2008 struct randread_source *r = randread_new (random_source, sizeof buf);
2009 if (! r)
2010 die (_("open failed"), random_source);
2011 randread (r, buf, sizeof buf);
2012 if (randread_free (r) != 0)
2013 die (_("close failed"), random_source);
2014 md5_init_ctx (&random_md5_state);
2015 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2018 /* This is like strxfrm, except it reports any error and exits. */
2020 static size_t
2021 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2023 errno = 0;
2024 size_t translated_size = strxfrm (dest, src, destsize);
2026 if (errno)
2028 error (0, errno, _("string transformation failed"));
2029 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2030 error (SORT_FAILURE, 0,
2031 _("the untransformed string was %s"),
2032 quotearg_n_style (0, locale_quoting_style, src));
2035 return translated_size;
2038 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2039 using one or more random hash functions. TEXTA[LENA] and
2040 TEXTB[LENB] must be zero. */
2042 static int
2043 compare_random (char *restrict texta, size_t lena,
2044 char *restrict textb, size_t lenb)
2046 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2047 data. This is used to break ties if there is a checksum
2048 collision, and this is good enough given the astronomically low
2049 probability of a collision. */
2050 int xfrm_diff = 0;
2052 char stackbuf[4000];
2053 char *buf = stackbuf;
2054 size_t bufsize = sizeof stackbuf;
2055 void *allocated = NULL;
2056 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2057 struct md5_ctx s[2];
2058 s[0] = s[1] = random_md5_state;
2060 if (hard_LC_COLLATE)
2062 char const *lima = texta + lena;
2063 char const *limb = textb + lenb;
2065 while (true)
2067 /* Transform the text into the basis of comparison, so that byte
2068 strings that would otherwise considered to be equal are
2069 considered equal here even if their bytes differ.
2071 Each time through this loop, transform one
2072 null-terminated string's worth from TEXTA or from TEXTB
2073 or both. That way, there's no need to store the
2074 transformation of the whole line, if it contains many
2075 null-terminated strings. */
2077 /* Store the transformed data into a big-enough buffer. */
2079 /* A 3X size guess avoids the overhead of calling strxfrm
2080 twice on typical implementations. Don't worry about
2081 size_t overflow, as the guess need not be correct. */
2082 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2083 if (bufsize < guess_bufsize)
2085 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2086 free (allocated);
2087 buf = allocated = malloc (bufsize);
2088 if (! buf)
2090 buf = stackbuf;
2091 bufsize = sizeof stackbuf;
2095 size_t sizea =
2096 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2097 bool a_fits = sizea <= bufsize;
2098 size_t sizeb =
2099 (textb < limb
2100 ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
2101 (a_fits ? bufsize - sizea : 0))
2102 + 1)
2103 : 0);
2105 if (! (a_fits && sizea + sizeb <= bufsize))
2107 bufsize = sizea + sizeb;
2108 if (bufsize < SIZE_MAX / 3)
2109 bufsize = bufsize * 3 / 2;
2110 free (allocated);
2111 buf = allocated = xmalloc (bufsize);
2112 if (texta < lima)
2113 strxfrm (buf, texta, sizea);
2114 if (textb < limb)
2115 strxfrm (buf + sizea, textb, sizeb);
2118 /* Advance past NULs to the next part of each input string,
2119 exiting the loop if both strings are exhausted. When
2120 exiting the loop, prepare to finish off the tiebreaker
2121 comparison properly. */
2122 if (texta < lima)
2123 texta += strlen (texta) + 1;
2124 if (textb < limb)
2125 textb += strlen (textb) + 1;
2126 if (! (texta < lima || textb < limb))
2128 lena = sizea; texta = buf;
2129 lenb = sizeb; textb = buf + sizea;
2130 break;
2133 /* Accumulate the transformed data in the corresponding
2134 checksums. */
2135 md5_process_bytes (buf, sizea, &s[0]);
2136 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2138 /* Update the tiebreaker comparison of the transformed data. */
2139 if (! xfrm_diff)
2141 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2142 if (! xfrm_diff)
2143 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2148 /* Compute and compare the checksums. */
2149 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2150 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2151 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2153 /* Fall back on the tiebreaker if the checksums collide. */
2154 if (! diff)
2156 if (! xfrm_diff)
2158 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2159 if (! xfrm_diff)
2160 xfrm_diff = (lena > lenb) - (lena < lenb);
2163 diff = xfrm_diff;
2166 free (allocated);
2168 return diff;
2171 /* Return the printable width of the block of memory starting at
2172 TEXT and ending just before LIM, counting each tab as one byte.
2173 FIXME: Should we generally be counting non printable chars? */
2175 static size_t
2176 debug_width (char const *text, char const *lim)
2178 size_t width = mbsnwidth (text, lim - text, 0);
2179 while (text < lim)
2180 width += (*text++ == '\t');
2181 return width;
2184 /* For debug mode, "underline" a key at the
2185 specified offset and screen width. */
2187 static void
2188 mark_key (size_t offset, size_t width)
2190 while (offset--)
2191 putchar (' ');
2193 if (!width)
2194 printf (_("^ no match for key\n"));
2195 else
2198 putchar ('_');
2199 while (--width);
2201 putchar ('\n');
2205 /* Return true if KEY is a numeric key. */
2207 static inline bool
2208 key_numeric (struct keyfield const *key)
2210 return key->numeric || key->general_numeric || key->human_numeric;
2213 /* For LINE, output a debugging line that underlines KEY in LINE.
2214 If KEY is null, underline the whole line. */
2216 static void
2217 debug_key (struct line const *line, struct keyfield const *key)
2219 char *text = line->text;
2220 char *beg = text;
2221 char *lim = text + line->length - 1;
2223 if (key)
2225 if (key->sword != SIZE_MAX)
2226 beg = begfield (line, key);
2227 if (key->eword != SIZE_MAX)
2228 lim = limfield (line, key);
2230 if (key->skipsblanks || key->month || key_numeric (key))
2232 char saved = *lim;
2233 *lim = '\0';
2235 while (blanks[to_uchar (*beg)])
2236 beg++;
2238 char *tighter_lim = beg;
2240 if (lim < beg)
2241 tighter_lim = lim;
2242 else if (key->month)
2243 getmonth (beg, &tighter_lim);
2244 else if (key->general_numeric)
2245 ignore_value (strtold (beg, &tighter_lim));
2246 else if (key->numeric || key->human_numeric)
2248 char *p = beg + (beg < lim && *beg == '-');
2249 bool found_digit = false;
2250 unsigned char ch;
2254 while (ISDIGIT (ch = *p++))
2255 found_digit = true;
2257 while (ch == thousands_sep);
2259 if (ch == decimal_point)
2260 while (ISDIGIT (ch = *p++))
2261 found_digit = true;
2263 if (found_digit)
2264 tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
2266 else
2267 tighter_lim = lim;
2269 *lim = saved;
2270 lim = tighter_lim;
2274 size_t offset = debug_width (text, beg);
2275 size_t width = debug_width (beg, lim);
2276 mark_key (offset, width);
2279 /* Debug LINE by underlining its keys. */
2281 static void
2282 debug_line (struct line const *line)
2284 struct keyfield const *key = keylist;
2287 debug_key (line, key);
2288 while (key && ((key = key->next) || ! (unique || stable)));
2291 /* Return whether sorting options specified for key. */
2293 static bool
2294 default_key_compare (struct keyfield const *key)
2296 return ! (key->ignore
2297 || key->translate
2298 || key->skipsblanks
2299 || key->skipeblanks
2300 || key_numeric (key)
2301 || key->month
2302 || key->version
2303 || key->random
2304 /* || key->reverse */
2308 /* Convert a key to the short options used to specify it. */
2310 static void
2311 key_to_opts (struct keyfield const *key, char *opts)
2313 if (key->skipsblanks || key->skipeblanks)
2314 *opts++ = 'b';/* either disables global -b */
2315 if (key->ignore == nondictionary)
2316 *opts++ = 'd';
2317 if (key->translate)
2318 *opts++ = 'f';
2319 if (key->general_numeric)
2320 *opts++ = 'g';
2321 if (key->human_numeric)
2322 *opts++ = 'h';
2323 if (key->ignore == nonprinting)
2324 *opts++ = 'i';
2325 if (key->month)
2326 *opts++ = 'M';
2327 if (key->numeric)
2328 *opts++ = 'n';
2329 if (key->random)
2330 *opts++ = 'R';
2331 if (key->reverse)
2332 *opts++ = 'r';
2333 if (key->version)
2334 *opts++ = 'V';
2335 *opts = '\0';
2338 /* Output data independent key warnings to stderr. */
2340 static void
2341 key_warnings (struct keyfield const *gkey, bool gkey_only)
2343 struct keyfield const *key;
2344 struct keyfield ugkey = *gkey;
2345 unsigned long keynum = 1;
2347 for (key = keylist; key; key = key->next, keynum++)
2349 if (key->obsolete_used)
2351 size_t sword = key->sword;
2352 size_t eword = key->eword;
2353 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2354 /* obsolescent syntax +A.x -B.y is equivalent to:
2355 -k A+1.x+1,B.y (when y = 0)
2356 -k A+1.x+1,B+1.y (when y > 0) */
2357 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2358 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2359 char *po = obuf;
2360 char *pn = nbuf;
2362 if (sword == SIZE_MAX)
2363 sword++;
2365 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2366 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2367 if (key->eword != SIZE_MAX)
2369 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2370 stpcpy (stpcpy (pn, ","),
2371 umaxtostr (eword + 1
2372 + (key->echar == SIZE_MAX), tmp));
2374 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2375 quote_n (0, obuf), quote_n (1, nbuf));
2378 /* Warn about field specs that will never match. */
2379 if (key->sword != SIZE_MAX && key->eword < key->sword)
2380 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2382 /* Warn about significant leading blanks. */
2383 bool implicit_skip = key_numeric (key) || key->month;
2384 bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
2385 && !(key->schar || key->echar);
2386 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2387 if (!gkey_only && tab == TAB_DEFAULT && !line_offset
2388 && ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
2389 || (!key->skipsblanks && key->schar)
2390 || (!key->skipeblanks && key->echar)))
2391 error (0, 0, _("leading blanks are significant in key %lu; "
2392 "consider also specifying 'b'"), keynum);
2394 /* Warn about numeric comparisons spanning fields,
2395 as field delimiters could be interpreted as part
2396 of the number (maybe only in other locales). */
2397 if (!gkey_only && key_numeric (key))
2399 size_t sword = key->sword + 1;
2400 size_t eword = key->eword + 1;
2401 if (!sword)
2402 sword++;
2403 if (!eword || sword < eword)
2404 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2405 keynum);
2408 /* Flag global options not copied or specified in any key. */
2409 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2410 ugkey.ignore = NULL;
2411 if (ugkey.translate && (ugkey.translate == key->translate))
2412 ugkey.translate = NULL;
2413 ugkey.skipsblanks &= !key->skipsblanks;
2414 ugkey.skipeblanks &= !key->skipeblanks;
2415 ugkey.month &= !key->month;
2416 ugkey.numeric &= !key->numeric;
2417 ugkey.general_numeric &= !key->general_numeric;
2418 ugkey.human_numeric &= !key->human_numeric;
2419 ugkey.random &= !key->random;
2420 ugkey.version &= !key->version;
2421 ugkey.reverse &= !key->reverse;
2424 /* Warn about ignored global options flagged above.
2425 Note if gkey is the only one in the list, all flags are cleared. */
2426 if (!default_key_compare (&ugkey)
2427 || (ugkey.reverse && (stable || unique) && keylist))
2429 bool ugkey_reverse = ugkey.reverse;
2430 if (!(stable || unique))
2431 ugkey.reverse = false;
2432 /* The following is too big, but guaranteed to be "big enough". */
2433 char opts[sizeof short_options];
2434 key_to_opts (&ugkey, opts);
2435 error (0, 0,
2436 ngettext ("option '-%s' is ignored",
2437 "options '-%s' are ignored",
2438 select_plural (strlen (opts))), opts);
2439 ugkey.reverse = ugkey_reverse;
2441 if (ugkey.reverse && !(stable || unique) && keylist)
2442 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2445 /* Compare two lines A and B trying every key in sequence until there
2446 are no more keys or a difference is found. */
2448 static int
2449 keycompare (struct line const *a, struct line const *b)
2451 struct keyfield *key = keylist;
2453 /* For the first iteration only, the key positions have been
2454 precomputed for us. */
2455 char *texta = a->keybeg;
2456 char *textb = b->keybeg;
2457 char *lima = a->keylim;
2458 char *limb = b->keylim;
2460 int diff;
2462 while (true)
2464 char const *translate = key->translate;
2465 bool const *ignore = key->ignore;
2467 /* Treat field ends before field starts as empty fields. */
2468 lima = MAX (texta, lima);
2469 limb = MAX (textb, limb);
2471 /* Find the lengths. */
2472 size_t lena = lima - texta;
2473 size_t lenb = limb - textb;
2475 if (hard_LC_COLLATE || key_numeric (key)
2476 || key->month || key->random || key->version)
2478 char *ta;
2479 char *tb;
2480 size_t tlena;
2481 size_t tlenb;
2483 char enda IF_LINT (= 0);
2484 char endb IF_LINT (= 0);
2485 void *allocated IF_LINT (= NULL);
2486 char stackbuf[4000];
2488 if (ignore || translate)
2490 /* Compute with copies of the keys, which are the result of
2491 translating or ignoring characters, and which need their
2492 own storage. */
2494 size_t i;
2496 /* Allocate space for copies. */
2497 size_t size = lena + 1 + lenb + 1;
2498 if (size <= sizeof stackbuf)
2499 ta = stackbuf, allocated = NULL;
2500 else
2501 ta = allocated = xmalloc (size);
2502 tb = ta + lena + 1;
2504 /* Put into each copy a version of the key in which the
2505 requested characters are ignored or translated. */
2506 for (tlena = i = 0; i < lena; i++)
2507 if (! (ignore && ignore[to_uchar (texta[i])]))
2508 ta[tlena++] = (translate
2509 ? translate[to_uchar (texta[i])]
2510 : texta[i]);
2511 ta[tlena] = '\0';
2513 for (tlenb = i = 0; i < lenb; i++)
2514 if (! (ignore && ignore[to_uchar (textb[i])]))
2515 tb[tlenb++] = (translate
2516 ? translate[to_uchar (textb[i])]
2517 : textb[i]);
2518 tb[tlenb] = '\0';
2520 else
2522 /* Use the keys in-place, temporarily null-terminated. */
2523 ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
2524 tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
2527 if (key->numeric)
2528 diff = numcompare (ta, tb);
2529 else if (key->general_numeric)
2530 diff = general_numcompare (ta, tb);
2531 else if (key->human_numeric)
2532 diff = human_numcompare (ta, tb);
2533 else if (key->month)
2534 diff = getmonth (ta, NULL) - getmonth (tb, NULL);
2535 else if (key->random)
2536 diff = compare_random (ta, tlena, tb, tlenb);
2537 else if (key->version)
2538 diff = filevercmp (ta, tb);
2539 else
2541 /* Locale-dependent string sorting. This is slower than
2542 C-locale sorting, which is implemented below. */
2543 if (tlena == 0)
2544 diff = - NONZERO (tlenb);
2545 else if (tlenb == 0)
2546 diff = 1;
2547 else
2548 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2551 if (ignore || translate)
2552 free (allocated);
2553 else
2555 ta[tlena] = enda;
2556 tb[tlenb] = endb;
2559 else if (ignore)
2561 #define CMP_WITH_IGNORE(A, B) \
2562 do \
2564 while (true) \
2566 while (texta < lima && ignore[to_uchar (*texta)]) \
2567 ++texta; \
2568 while (textb < limb && ignore[to_uchar (*textb)]) \
2569 ++textb; \
2570 if (! (texta < lima && textb < limb)) \
2571 break; \
2572 diff = to_uchar (A) - to_uchar (B); \
2573 if (diff) \
2574 goto not_equal; \
2575 ++texta; \
2576 ++textb; \
2579 diff = (texta < lima) - (textb < limb); \
2581 while (0)
2583 if (translate)
2584 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2585 translate[to_uchar (*textb)]);
2586 else
2587 CMP_WITH_IGNORE (*texta, *textb);
2589 else if (lena == 0)
2590 diff = - NONZERO (lenb);
2591 else if (lenb == 0)
2592 goto greater;
2593 else
2595 if (translate)
2597 while (texta < lima && textb < limb)
2599 diff = (to_uchar (translate[to_uchar (*texta++)])
2600 - to_uchar (translate[to_uchar (*textb++)]));
2601 if (diff)
2602 goto not_equal;
2605 else
2607 diff = memcmp (texta, textb, MIN (lena, lenb));
2608 if (diff)
2609 goto not_equal;
2611 diff = lena < lenb ? -1 : lena != lenb;
2614 if (diff)
2615 goto not_equal;
2617 key = key->next;
2618 if (! key)
2619 break;
2621 /* Find the beginning and limit of the next field. */
2622 if (key->eword != SIZE_MAX)
2623 lima = limfield (a, key), limb = limfield (b, key);
2624 else
2625 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2627 if (key->sword != SIZE_MAX)
2628 texta = begfield (a, key), textb = begfield (b, key);
2629 else
2631 texta = a->text, textb = b->text;
2632 if (key->skipsblanks)
2634 while (texta < lima && blanks[to_uchar (*texta)])
2635 ++texta;
2636 while (textb < limb && blanks[to_uchar (*textb)])
2637 ++textb;
2642 return 0;
2644 greater:
2645 diff = 1;
2646 not_equal:
2647 return key->reverse ? -diff : diff;
2650 /* Compare two lines A and B, returning negative, zero, or positive
2651 depending on whether A compares less than, equal to, or greater than B. */
2653 static int
2654 compare (struct line const *a, struct line const *b)
2656 int diff;
2657 size_t alen, blen;
2659 /* First try to compare on the specified keys (if any).
2660 The only two cases with no key at all are unadorned sort,
2661 and unadorned sort -r. */
2662 if (keylist)
2664 diff = keycompare (a, b);
2665 if (diff || unique || stable)
2666 return diff;
2669 /* If the keys all compare equal (or no keys were specified)
2670 fall through to the default comparison. */
2671 alen = a->length - 1, blen = b->length - 1;
2673 if (alen == 0)
2674 diff = - NONZERO (blen);
2675 else if (blen == 0)
2676 diff = 1;
2677 else if (hard_LC_COLLATE)
2679 /* Note xmemcoll0 is a performance enhancement as
2680 it will not unconditionally write '\0' after the
2681 passed in buffers, which was seen to give around
2682 a 3% increase in performance for short lines. */
2683 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2685 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2686 diff = alen < blen ? -1 : alen != blen;
2688 return reverse ? -diff : diff;
2691 /* Write LINE to output stream FP; the output file's name is
2692 OUTPUT_FILE if OUTPUT_FILE is nonnull, and is the standard output
2693 otherwise. If debugging is enabled and FP is standard output,
2694 append some debugging information. */
2696 static void
2697 write_line (struct line const *line, FILE *fp, char const *output_file)
2699 char *buf = line->text;
2700 size_t n_bytes = line->length;
2701 char *ebuf = buf + n_bytes;
2703 if (!output_file && debug)
2705 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2706 char const *c = buf;
2708 while (c < ebuf)
2710 char wc = *c++;
2711 if (wc == '\t')
2712 wc = '>';
2713 else if (c == ebuf)
2714 wc = '\n';
2715 if (fputc (wc, fp) == EOF)
2716 die (_("write failed"), output_file);
2719 debug_line (line);
2721 else
2723 ebuf[-1] = eolchar;
2724 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2725 die (_("write failed"), output_file);
2726 ebuf[-1] = '\0';
2730 /* Check that the lines read from FILE_NAME come in order. Return
2731 true if they are in order. If CHECKONLY == 'c', also print a
2732 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2733 they are not in order. */
2735 static bool
2736 check (char const *file_name, char checkonly)
2738 FILE *fp = xfopen (file_name, "r");
2739 struct buffer buf; /* Input buffer. */
2740 struct line temp; /* Copy of previous line. */
2741 size_t alloc = 0;
2742 uintmax_t line_number = 0;
2743 struct keyfield const *key = keylist;
2744 bool nonunique = ! unique;
2745 bool ordered = true;
2747 initbuf (&buf, sizeof (struct line),
2748 MAX (merge_buffer_size, sort_size));
2749 temp.text = NULL;
2751 while (fillbuf (&buf, fp, file_name))
2753 struct line const *line = buffer_linelim (&buf);
2754 struct line const *linebase = line - buf.nlines;
2756 /* Make sure the line saved from the old buffer contents is
2757 less than or equal to the first line of the new buffer. */
2758 if (alloc && nonunique <= compare (&temp, line - 1))
2760 found_disorder:
2762 if (checkonly == 'c')
2764 struct line const *disorder_line = line - 1;
2765 uintmax_t disorder_line_number =
2766 buffer_linelim (&buf) - disorder_line + line_number;
2767 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2768 fprintf (stderr, _("%s: %s:%s: disorder: "),
2769 program_name, file_name,
2770 umaxtostr (disorder_line_number, hr_buf));
2771 write_line (disorder_line, stderr, _("standard error"));
2774 ordered = false;
2775 break;
2779 /* Compare each line in the buffer with its successor. */
2780 while (linebase < --line)
2781 if (nonunique <= compare (line, line - 1))
2782 goto found_disorder;
2784 line_number += buf.nlines;
2786 /* Save the last line of the buffer. */
2787 if (alloc < line->length)
2791 alloc *= 2;
2792 if (! alloc)
2794 alloc = line->length;
2795 break;
2798 while (alloc < line->length);
2800 free (temp.text);
2801 temp.text = xmalloc (alloc);
2803 memcpy (temp.text, line->text, line->length);
2804 temp.length = line->length;
2805 if (key)
2807 temp.keybeg = temp.text + (line->keybeg - line->text);
2808 temp.keylim = temp.text + (line->keylim - line->text);
2812 xfclose (fp, file_name);
2813 free (buf.buf);
2814 free (temp.text);
2815 return ordered;
2818 /* Open FILES (there are NFILES of them) and store the resulting array
2819 of stream pointers into (*PFPS). Allocate the array. Return the
2820 number of successfully opened files, setting errno if this value is
2821 less than NFILES. */
2823 static size_t
2824 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2826 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2827 int i;
2829 /* Open as many input files as we can. */
2830 for (i = 0; i < nfiles; i++)
2832 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
2833 ? open_temp (files[i].temp)
2834 : stream_open (files[i].name, "r"));
2835 if (!fps[i])
2836 break;
2839 return i;
2842 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2843 files (all of which are at the start of the FILES array), and
2844 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2845 FPS is the vector of open stream corresponding to the files.
2846 Close input and output streams before returning.
2847 OUTPUT_FILE gives the name of the output file. If it is NULL,
2848 the output file is standard output. */
2850 static void
2851 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2852 FILE *ofp, char const *output_file, FILE **fps)
2854 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2855 /* Input buffers for each file. */
2856 struct line saved; /* Saved line storage for unique check. */
2857 struct line const *savedline = NULL;
2858 /* &saved if there is a saved line. */
2859 size_t savealloc = 0; /* Size allocated for the saved line. */
2860 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2861 /* Current line in each line table. */
2862 struct line const **base = xnmalloc (nfiles, sizeof *base);
2863 /* Base of each line table. */
2864 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2865 /* Table representing a permutation of fps,
2866 such that cur[ord[0]] is the smallest line
2867 and will be next output. */
2868 size_t i;
2869 size_t j;
2870 size_t t;
2871 struct keyfield const *key = keylist;
2872 saved.text = NULL;
2874 /* Read initial lines from each input file. */
2875 for (i = 0; i < nfiles; )
2877 initbuf (&buffer[i], sizeof (struct line),
2878 MAX (merge_buffer_size, sort_size / nfiles));
2879 if (fillbuf (&buffer[i], fps[i], files[i].name))
2881 struct line const *linelim = buffer_linelim (&buffer[i]);
2882 cur[i] = linelim - 1;
2883 base[i] = linelim - buffer[i].nlines;
2884 i++;
2886 else
2888 /* fps[i] is empty; eliminate it from future consideration. */
2889 xfclose (fps[i], files[i].name);
2890 if (i < ntemps)
2892 ntemps--;
2893 zaptemp (files[i].name);
2895 free (buffer[i].buf);
2896 --nfiles;
2897 for (j = i; j < nfiles; ++j)
2899 files[j] = files[j + 1];
2900 fps[j] = fps[j + 1];
2905 /* Set up the ord table according to comparisons among input lines.
2906 Since this only reorders two items if one is strictly greater than
2907 the other, it is stable. */
2908 for (i = 0; i < nfiles; ++i)
2909 ord[i] = i;
2910 for (i = 1; i < nfiles; ++i)
2911 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2912 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2914 /* Repeatedly output the smallest line until no input remains. */
2915 while (nfiles)
2917 struct line const *smallest = cur[ord[0]];
2919 /* If uniquified output is turned on, output only the first of
2920 an identical series of lines. */
2921 if (unique)
2923 if (savedline && compare (savedline, smallest))
2925 savedline = NULL;
2926 write_line (&saved, ofp, output_file);
2928 if (!savedline)
2930 savedline = &saved;
2931 if (savealloc < smallest->length)
2934 if (! savealloc)
2936 savealloc = smallest->length;
2937 break;
2939 while ((savealloc *= 2) < smallest->length);
2941 free (saved.text);
2942 saved.text = xmalloc (savealloc);
2944 saved.length = smallest->length;
2945 memcpy (saved.text, smallest->text, saved.length);
2946 if (key)
2948 saved.keybeg =
2949 saved.text + (smallest->keybeg - smallest->text);
2950 saved.keylim =
2951 saved.text + (smallest->keylim - smallest->text);
2955 else
2956 write_line (smallest, ofp, output_file);
2958 /* Check if we need to read more lines into core. */
2959 if (base[ord[0]] < smallest)
2960 cur[ord[0]] = smallest - 1;
2961 else
2963 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2965 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2966 cur[ord[0]] = linelim - 1;
2967 base[ord[0]] = linelim - buffer[ord[0]].nlines;
2969 else
2971 /* We reached EOF on fps[ord[0]]. */
2972 for (i = 1; i < nfiles; ++i)
2973 if (ord[i] > ord[0])
2974 --ord[i];
2975 --nfiles;
2976 xfclose (fps[ord[0]], files[ord[0]].name);
2977 if (ord[0] < ntemps)
2979 ntemps--;
2980 zaptemp (files[ord[0]].name);
2982 free (buffer[ord[0]].buf);
2983 for (i = ord[0]; i < nfiles; ++i)
2985 fps[i] = fps[i + 1];
2986 files[i] = files[i + 1];
2987 buffer[i] = buffer[i + 1];
2988 cur[i] = cur[i + 1];
2989 base[i] = base[i + 1];
2991 for (i = 0; i < nfiles; ++i)
2992 ord[i] = ord[i + 1];
2993 continue;
2997 /* The new line just read in may be larger than other lines
2998 already in main memory; push it back in the queue until we
2999 encounter a line larger than it. Optimize for the common
3000 case where the new line is smallest. */
3002 size_t lo = 1;
3003 size_t hi = nfiles;
3004 size_t probe = lo;
3005 size_t ord0 = ord[0];
3006 size_t count_of_smaller_lines;
3008 while (lo < hi)
3010 int cmp = compare (cur[ord0], cur[ord[probe]]);
3011 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3012 hi = probe;
3013 else
3014 lo = probe + 1;
3015 probe = (lo + hi) / 2;
3018 count_of_smaller_lines = lo - 1;
3019 for (j = 0; j < count_of_smaller_lines; j++)
3020 ord[j] = ord[j + 1];
3021 ord[count_of_smaller_lines] = ord0;
3025 if (unique && savedline)
3027 write_line (&saved, ofp, output_file);
3028 free (saved.text);
3031 xfclose (ofp, output_file);
3032 free (fps);
3033 free (buffer);
3034 free (ord);
3035 free (base);
3036 free (cur);
3039 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3040 files (all of which are at the start of the FILES array), and
3041 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3042 Close input and output files before returning.
3043 OUTPUT_FILE gives the name of the output file.
3045 Return the number of files successfully merged. This number can be
3046 less than NFILES if we ran low on file descriptors, but in this
3047 case it is never less than 2. */
3049 static size_t
3050 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3051 FILE *ofp, char const *output_file)
3053 FILE **fps;
3054 size_t nopened = open_input_files (files, nfiles, &fps);
3055 if (nopened < nfiles && nopened < 2)
3056 die (_("open failed"), files[nopened].name);
3057 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3058 return nopened;
3061 /* Merge into T (of size NLINES) the two sorted arrays of lines
3062 LO (with NLINES / 2 members), and
3063 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3064 T and LO point just past their respective arrays, and the arrays
3065 are in reverse order. NLINES must be at least 2. */
3067 static void
3068 mergelines (struct line *restrict t, size_t nlines,
3069 struct line const *restrict lo)
3071 size_t nlo = nlines / 2;
3072 size_t nhi = nlines - nlo;
3073 struct line *hi = t - nlo;
3075 while (true)
3076 if (compare (lo - 1, hi - 1) <= 0)
3078 *--t = *--lo;
3079 if (! --nlo)
3081 /* HI must equal T now, and there is no need to copy from
3082 HI to T. */
3083 return;
3086 else
3088 *--t = *--hi;
3089 if (! --nhi)
3092 *--t = *--lo;
3093 while (--nlo);
3095 return;
3100 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3101 Do this all within one thread. NLINES must be at least 2.
3102 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3103 Otherwise the sort is in-place and TEMP is half-sized.
3104 The input and output arrays are in reverse order, and LINES and
3105 TEMP point just past the end of their respective arrays.
3107 Use a recursive divide-and-conquer algorithm, in the style
3108 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3109 the optimization suggested by exercise 5.2.4-10; this requires room
3110 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3111 writes that this memory optimization was originally published by
3112 D. A. Bell, Comp J. 1 (1958), 75. */
3114 static void
3115 sequential_sort (struct line *restrict lines, size_t nlines,
3116 struct line *restrict temp, bool to_temp)
3118 if (nlines == 2)
3120 /* Declare 'swap' as int, not bool, to work around a bug
3121 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3122 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3123 int swap = (0 < compare (&lines[-1], &lines[-2]));
3124 if (to_temp)
3126 temp[-1] = lines[-1 - swap];
3127 temp[-2] = lines[-2 + swap];
3129 else if (swap)
3131 temp[-1] = lines[-1];
3132 lines[-1] = lines[-2];
3133 lines[-2] = temp[-1];
3136 else
3138 size_t nlo = nlines / 2;
3139 size_t nhi = nlines - nlo;
3140 struct line *lo = lines;
3141 struct line *hi = lines - nlo;
3143 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3144 if (1 < nlo)
3145 sequential_sort (lo, nlo, temp, !to_temp);
3146 else if (!to_temp)
3147 temp[-1] = lo[-1];
3149 struct line *dest;
3150 struct line const *sorted_lo;
3151 if (to_temp)
3153 dest = temp;
3154 sorted_lo = lines;
3156 else
3158 dest = lines;
3159 sorted_lo = temp;
3161 mergelines (dest, nlines, sorted_lo);
3165 static struct merge_node *init_node (struct merge_node *restrict,
3166 struct merge_node *restrict,
3167 struct line *, size_t, size_t, bool);
3170 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3171 lines, with destination DEST. */
3172 static struct merge_node *
3173 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3175 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3177 struct merge_node *root = merge_tree;
3178 root->lo = root->hi = root->end_lo = root->end_hi = NULL;
3179 root->dest = NULL;
3180 root->nlo = root->nhi = nlines;
3181 root->parent = NULL;
3182 root->level = MERGE_END;
3183 root->queued = false;
3184 pthread_mutex_init (&root->lock, NULL);
3186 init_node (root, root + 1, dest, nthreads, nlines, false);
3187 return merge_tree;
3190 /* Destroy the merge tree. */
3191 static void
3192 merge_tree_destroy (struct merge_node *merge_tree)
3194 free (merge_tree);
3197 /* Initialize a merge tree node and its descendants. The node's
3198 parent is PARENT. The node and its descendants are taken from the
3199 array of nodes NODE_POOL. Their destination starts at DEST; they
3200 will consume NTHREADS threads. The total number of sort lines is
3201 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3202 its parent. */
3204 static struct merge_node *
3205 init_node (struct merge_node *restrict parent,
3206 struct merge_node *restrict node_pool,
3207 struct line *dest, size_t nthreads,
3208 size_t total_lines, bool is_lo_child)
3210 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3211 size_t nlo = nlines / 2;
3212 size_t nhi = nlines - nlo;
3213 struct line *lo = dest - total_lines;
3214 struct line *hi = lo - nlo;
3215 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3217 struct merge_node *node = node_pool++;
3218 node->lo = node->end_lo = lo;
3219 node->hi = node->end_hi = hi;
3220 node->dest = parent_end;
3221 node->nlo = nlo;
3222 node->nhi = nhi;
3223 node->parent = parent;
3224 node->level = parent->level + 1;
3225 node->queued = false;
3226 pthread_mutex_init (&node->lock, NULL);
3228 if (nthreads > 1)
3230 size_t lo_threads = nthreads / 2;
3231 size_t hi_threads = nthreads - lo_threads;
3232 node->lo_child = node_pool;
3233 node_pool = init_node (node, node_pool, lo, lo_threads,
3234 total_lines, true);
3235 node->hi_child = node_pool;
3236 node_pool = init_node (node, node_pool, hi, hi_threads,
3237 total_lines, false);
3239 else
3241 node->lo_child = NULL;
3242 node->hi_child = NULL;
3244 return node_pool;
3248 /* Compare two merge nodes A and B for priority. */
3250 static int
3251 compare_nodes (void const *a, void const *b)
3253 struct merge_node const *nodea = a;
3254 struct merge_node const *nodeb = b;
3255 if (nodea->level == nodeb->level)
3256 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3257 return nodea->level < nodeb->level;
3260 /* Lock a merge tree NODE. */
3262 static inline void
3263 lock_node (struct merge_node *node)
3265 pthread_mutex_lock (&node->lock);
3268 /* Unlock a merge tree NODE. */
3270 static inline void
3271 unlock_node (struct merge_node *node)
3273 pthread_mutex_unlock (&node->lock);
3276 /* Destroy merge QUEUE. */
3278 static void
3279 queue_destroy (struct merge_node_queue *queue)
3281 heap_free (queue->priority_queue);
3282 pthread_cond_destroy (&queue->cond);
3283 pthread_mutex_destroy (&queue->mutex);
3286 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3287 NTHREADS threads. */
3289 static void
3290 queue_init (struct merge_node_queue *queue, size_t nthreads)
3292 /* Though it's highly unlikely all nodes are in the heap at the same
3293 time, the heap should accommodate all of them. Counting a NULL
3294 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3295 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3296 pthread_mutex_init (&queue->mutex, NULL);
3297 pthread_cond_init (&queue->cond, NULL);
3300 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3301 does not need to lock NODE. */
3303 static void
3304 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3306 pthread_mutex_lock (&queue->mutex);
3307 heap_insert (queue->priority_queue, node);
3308 node->queued = true;
3309 pthread_mutex_unlock (&queue->mutex);
3310 pthread_cond_signal (&queue->cond);
3313 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3315 static struct merge_node *
3316 queue_pop (struct merge_node_queue *queue)
3318 struct merge_node *node;
3319 pthread_mutex_lock (&queue->mutex);
3320 while (! (node = heap_remove_top (queue->priority_queue)))
3321 pthread_cond_wait (&queue->cond, &queue->mutex);
3322 pthread_mutex_unlock (&queue->mutex);
3323 lock_node (node);
3324 node->queued = false;
3325 return node;
3328 /* Output LINE to TFP, unless -u is specified and the line compares
3329 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3330 is null if TFP is standard output.
3332 This function does not save the line for comparison later, so it is
3333 appropriate only for internal sort. */
3335 static void
3336 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3338 static struct line saved;
3340 if (unique)
3342 if (saved.text && ! compare (line, &saved))
3343 return;
3344 saved = *line;
3347 write_line (line, tfp, temp_output);
3350 /* Merge the lines currently available to a NODE in the binary
3351 merge tree. Merge a number of lines appropriate for this merge
3352 level, assuming TOTAL_LINES is the total number of lines.
3354 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3355 the name of TFP, or is null if TFP is standard output. */
3357 static void
3358 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3359 FILE *tfp, char const *temp_output)
3361 struct line *lo_orig = node->lo;
3362 struct line *hi_orig = node->hi;
3363 size_t to_merge = MAX_MERGE (total_lines, node->level);
3364 size_t merged_lo;
3365 size_t merged_hi;
3367 if (node->level > MERGE_ROOT)
3369 /* Merge to destination buffer. */
3370 struct line *dest = *node->dest;
3371 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3372 if (compare (node->lo - 1, node->hi - 1) <= 0)
3373 *--dest = *--node->lo;
3374 else
3375 *--dest = *--node->hi;
3377 merged_lo = lo_orig - node->lo;
3378 merged_hi = hi_orig - node->hi;
3380 if (node->nhi == merged_hi)
3381 while (node->lo != node->end_lo && to_merge--)
3382 *--dest = *--node->lo;
3383 else if (node->nlo == merged_lo)
3384 while (node->hi != node->end_hi && to_merge--)
3385 *--dest = *--node->hi;
3386 *node->dest = dest;
3388 else
3390 /* Merge directly to output. */
3391 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3393 if (compare (node->lo - 1, node->hi - 1) <= 0)
3394 write_unique (--node->lo, tfp, temp_output);
3395 else
3396 write_unique (--node->hi, tfp, temp_output);
3399 merged_lo = lo_orig - node->lo;
3400 merged_hi = hi_orig - node->hi;
3402 if (node->nhi == merged_hi)
3404 while (node->lo != node->end_lo && to_merge--)
3405 write_unique (--node->lo, tfp, temp_output);
3407 else if (node->nlo == merged_lo)
3409 while (node->hi != node->end_hi && to_merge--)
3410 write_unique (--node->hi, tfp, temp_output);
3414 /* Update NODE. */
3415 merged_lo = lo_orig - node->lo;
3416 merged_hi = hi_orig - node->hi;
3417 node->nlo -= merged_lo;
3418 node->nhi -= merged_hi;
3421 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3422 NODE's children has available lines and the other either has
3423 available lines or has exhausted its lines. */
3425 static void
3426 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3428 if (! node->queued)
3430 bool lo_avail = (node->lo - node->end_lo) != 0;
3431 bool hi_avail = (node->hi - node->end_hi) != 0;
3432 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3433 queue_insert (queue, node);
3437 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3439 static void
3440 queue_check_insert_parent (struct merge_node_queue *queue,
3441 struct merge_node *node)
3443 if (node->level > MERGE_ROOT)
3445 lock_node (node->parent);
3446 queue_check_insert (queue, node->parent);
3447 unlock_node (node->parent);
3449 else if (node->nlo + node->nhi == 0)
3451 /* If the MERGE_ROOT NODE has finished merging, insert the
3452 MERGE_END node. */
3453 queue_insert (queue, node->parent);
3457 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3458 some of those lines, until the MERGE_END node is popped.
3459 TOTAL_LINES is the total number of lines. If merging at the top
3460 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3461 null if TFP is standard output. */
3463 static void
3464 merge_loop (struct merge_node_queue *queue,
3465 size_t total_lines, FILE *tfp, char const *temp_output)
3467 while (1)
3469 struct merge_node *node = queue_pop (queue);
3471 if (node->level == MERGE_END)
3473 unlock_node (node);
3474 /* Reinsert so other threads can pop it. */
3475 queue_insert (queue, node);
3476 break;
3478 mergelines_node (node, total_lines, tfp, temp_output);
3479 queue_check_insert (queue, node);
3480 queue_check_insert_parent (queue, node);
3482 unlock_node (node);
3487 static void sortlines (struct line *restrict, size_t, size_t,
3488 struct merge_node *, struct merge_node_queue *,
3489 FILE *, char const *);
3491 /* Thread arguments for sortlines_thread. */
3493 struct thread_args
3495 /* Source, i.e., the array of lines to sort. This points just past
3496 the end of the array. */
3497 struct line *lines;
3499 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3500 size_t nthreads;
3502 /* Number of lines in LINES and DEST. */
3503 size_t const total_lines;
3505 /* Merge node. Lines from this node and this node's sibling will merged
3506 to this node's parent. */
3507 struct merge_node *const node;
3509 /* The priority queue controlling available work for the entire
3510 internal sort. */
3511 struct merge_node_queue *const queue;
3513 /* If at the top level, the file to output to, and the file's name.
3514 If the file is standard output, the file's name is null. */
3515 FILE *tfp;
3516 char const *output_temp;
3519 /* Like sortlines, except with a signature acceptable to pthread_create. */
3521 static void *
3522 sortlines_thread (void *data)
3524 struct thread_args const *args = data;
3525 sortlines (args->lines, args->nthreads, args->total_lines,
3526 args->node, args->queue, args->tfp,
3527 args->output_temp);
3528 return NULL;
3531 /* Sort lines, possibly in parallel. The arguments are as in struct
3532 thread_args above.
3534 The algorithm has three phases: node creation, sequential sort,
3535 and binary merge.
3537 During node creation, sortlines recursively visits each node in the
3538 binary merge tree and creates a NODE structure corresponding to all the
3539 future line merging NODE is responsible for. For each call to
3540 sortlines, half the available threads are assigned to each recursive
3541 call, until a leaf node having only 1 available thread is reached.
3543 Each leaf node then performs two sequential sorts, one on each half of
3544 the lines it is responsible for. It records in its NODE structure that
3545 there are two sorted sublists available to merge from, and inserts its
3546 NODE into the priority queue.
3548 The binary merge phase then begins. Each thread drops into a loop
3549 where the thread retrieves a NODE from the priority queue, merges lines
3550 available to that NODE, and potentially insert NODE or its parent back
3551 into the queue if there are sufficient available lines for them to
3552 merge. This continues until all lines at all nodes of the merge tree
3553 have been merged. */
3555 static void
3556 sortlines (struct line *restrict lines, size_t nthreads,
3557 size_t total_lines, struct merge_node *node,
3558 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3560 size_t nlines = node->nlo + node->nhi;
3562 /* Calculate thread arguments. */
3563 size_t lo_threads = nthreads / 2;
3564 size_t hi_threads = nthreads - lo_threads;
3565 pthread_t thread;
3566 struct thread_args args = {lines, lo_threads, total_lines,
3567 node->lo_child, queue, tfp, temp_output};
3569 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3570 && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3572 sortlines (lines - node->nlo, hi_threads, total_lines,
3573 node->hi_child, queue, tfp, temp_output);
3574 pthread_join (thread, NULL);
3576 else
3578 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3579 Sort with 1 thread. */
3580 size_t nlo = node->nlo;
3581 size_t nhi = node->nhi;
3582 struct line *temp = lines - total_lines;
3583 if (1 < nhi)
3584 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3585 if (1 < nlo)
3586 sequential_sort (lines, nlo, temp, false);
3588 /* Update merge NODE. No need to lock yet. */
3589 node->lo = lines;
3590 node->hi = lines - nlo;
3591 node->end_lo = lines - nlo;
3592 node->end_hi = lines - nlo - nhi;
3594 queue_insert (queue, node);
3595 merge_loop (queue, total_lines, tfp, temp_output);
3598 pthread_mutex_destroy (&node->lock);
3601 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3602 the same as OUTFILE. If found, replace each with the same
3603 temporary copy that can be merged into OUTFILE without destroying
3604 OUTFILE before it is completely read. This temporary copy does not
3605 count as a merge temp, so don't worry about incrementing NTEMPS in
3606 the caller; final cleanup will remove it, not zaptemp.
3608 This test ensures that an otherwise-erroneous use like
3609 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3610 It's not clear that POSIX requires this nicety.
3611 Detect common error cases, but don't try to catch obscure cases like
3612 "cat ... FILE ... | sort -m -o FILE"
3613 where traditional "sort" doesn't copy the input and where
3614 people should know that they're getting into trouble anyway.
3615 Catching these obscure cases would slow down performance in
3616 common cases. */
3618 static void
3619 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3620 size_t nfiles, char const *outfile)
3622 size_t i;
3623 bool got_outstat = false;
3624 struct stat outstat;
3625 struct tempnode *tempcopy = NULL;
3627 for (i = ntemps; i < nfiles; i++)
3629 bool is_stdin = STREQ (files[i].name, "-");
3630 bool same;
3631 struct stat instat;
3633 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3634 same = true;
3635 else
3637 if (! got_outstat)
3639 if ((outfile
3640 ? stat (outfile, &outstat)
3641 : fstat (STDOUT_FILENO, &outstat))
3642 != 0)
3643 break;
3644 got_outstat = true;
3647 same = (((is_stdin
3648 ? fstat (STDIN_FILENO, &instat)
3649 : stat (files[i].name, &instat))
3650 == 0)
3651 && SAME_INODE (instat, outstat));
3654 if (same)
3656 if (! tempcopy)
3658 FILE *tftp;
3659 tempcopy = create_temp (&tftp);
3660 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3663 files[i].name = tempcopy->name;
3664 files[i].temp = tempcopy;
3669 /* Merge the input FILES. NTEMPS is the number of files at the
3670 start of FILES that are temporary; it is zero at the top level.
3671 NFILES is the total number of files. Put the output in
3672 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3674 static void
3675 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3676 char const *output_file)
3678 while (nmerge < nfiles)
3680 /* Number of input files processed so far. */
3681 size_t in;
3683 /* Number of output files generated so far. */
3684 size_t out;
3686 /* nfiles % NMERGE; this counts input files that are left over
3687 after all full-sized merges have been done. */
3688 size_t remainder;
3690 /* Number of easily-available slots at the next loop iteration. */
3691 size_t cheap_slots;
3693 /* Do as many NMERGE-size merges as possible. In the case that
3694 nmerge is bogus, increment by the maximum number of file
3695 descriptors allowed. */
3696 for (out = in = 0; nmerge <= nfiles - in; out++)
3698 FILE *tfp;
3699 struct tempnode *temp = create_temp (&tfp);
3700 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3701 nmerge, tfp, temp->name);
3702 ntemps -= MIN (ntemps, num_merged);
3703 files[out].name = temp->name;
3704 files[out].temp = temp;
3705 in += num_merged;
3708 remainder = nfiles - in;
3709 cheap_slots = nmerge - out % nmerge;
3711 if (cheap_slots < remainder)
3713 /* So many files remain that they can't all be put into the last
3714 NMERGE-sized output window. Do one more merge. Merge as few
3715 files as possible, to avoid needless I/O. */
3716 size_t nshortmerge = remainder - cheap_slots + 1;
3717 FILE *tfp;
3718 struct tempnode *temp = create_temp (&tfp);
3719 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3720 nshortmerge, tfp, temp->name);
3721 ntemps -= MIN (ntemps, num_merged);
3722 files[out].name = temp->name;
3723 files[out++].temp = temp;
3724 in += num_merged;
3727 /* Put the remaining input files into the last NMERGE-sized output
3728 window, so they will be merged in the next pass. */
3729 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3730 ntemps += out;
3731 nfiles -= in - out;
3734 avoid_trashing_input (files, ntemps, nfiles, output_file);
3736 /* We aren't guaranteed that this final mergefiles will work, therefore we
3737 try to merge into the output, and then merge as much as we can into a
3738 temp file if we can't. Repeat. */
3740 while (true)
3742 /* Merge directly into the output file if possible. */
3743 FILE **fps;
3744 size_t nopened = open_input_files (files, nfiles, &fps);
3746 if (nopened == nfiles)
3748 FILE *ofp = stream_open (output_file, "w");
3749 if (ofp)
3751 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3752 break;
3754 if (errno != EMFILE || nopened <= 2)
3755 die (_("open failed"), output_file);
3757 else if (nopened <= 2)
3758 die (_("open failed"), files[nopened].name);
3760 /* We ran out of file descriptors. Close one of the input
3761 files, to gain a file descriptor. Then create a temporary
3762 file with our spare file descriptor. Retry if that failed
3763 (e.g., some other process could open a file between the time
3764 we closed and tried to create). */
3765 FILE *tfp;
3766 struct tempnode *temp;
3769 nopened--;
3770 xfclose (fps[nopened], files[nopened].name);
3771 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
3773 while (!temp);
3775 /* Merge into the newly allocated temporary. */
3776 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
3777 fps);
3778 ntemps -= MIN (ntemps, nopened);
3779 files[0].name = temp->name;
3780 files[0].temp = temp;
3782 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3783 ntemps++;
3784 nfiles -= nopened - 1;
3788 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3790 static void
3791 sort (char *const *files, size_t nfiles, char const *output_file,
3792 size_t nthreads)
3794 struct buffer buf;
3795 IF_LINT (buf.buf = NULL);
3796 size_t ntemps = 0;
3797 bool output_file_created = false;
3799 buf.alloc = 0;
3801 while (nfiles)
3803 char const *temp_output;
3804 char const *file = *files;
3805 FILE *fp = xfopen (file, "r");
3806 FILE *tfp;
3808 size_t bytes_per_line;
3809 if (nthreads > 1)
3811 /* Get log P. */
3812 size_t tmp = 1;
3813 size_t mult = 1;
3814 while (tmp < nthreads)
3816 tmp *= 2;
3817 mult++;
3819 bytes_per_line = (mult * sizeof (struct line));
3821 else
3822 bytes_per_line = sizeof (struct line) * 3 / 2;
3824 if (! buf.alloc)
3825 initbuf (&buf, bytes_per_line,
3826 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3827 buf.eof = false;
3828 files++;
3829 nfiles--;
3831 while (fillbuf (&buf, fp, file))
3833 struct line *line;
3835 if (buf.eof && nfiles
3836 && (bytes_per_line + 1
3837 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3839 /* End of file, but there is more input and buffer room.
3840 Concatenate the next input file; this is faster in
3841 the usual case. */
3842 buf.left = buf.used;
3843 break;
3846 line = buffer_linelim (&buf);
3847 if (buf.eof && !nfiles && !ntemps && !buf.left)
3849 xfclose (fp, file);
3850 tfp = xfopen (output_file, "w");
3851 temp_output = output_file;
3852 output_file_created = true;
3854 else
3856 ++ntemps;
3857 temp_output = create_temp (&tfp)->name;
3859 if (1 < buf.nlines)
3861 struct merge_node_queue queue;
3862 queue_init (&queue, nthreads);
3863 struct merge_node *merge_tree =
3864 merge_tree_init (nthreads, buf.nlines, line);
3865 struct merge_node *root = merge_tree + 1;
3867 sortlines (line, nthreads, buf.nlines, root,
3868 &queue, tfp, temp_output);
3869 queue_destroy (&queue);
3870 pthread_mutex_destroy (&root->lock);
3871 merge_tree_destroy (merge_tree);
3873 else
3874 write_unique (line - 1, tfp, temp_output);
3876 xfclose (tfp, temp_output);
3878 if (output_file_created)
3879 goto finish;
3881 xfclose (fp, file);
3884 finish:
3885 free (buf.buf);
3887 if (! output_file_created)
3889 size_t i;
3890 struct tempnode *node = temphead;
3891 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3892 for (i = 0; node; i++)
3894 tempfiles[i].name = node->name;
3895 tempfiles[i].temp = node;
3896 node = node->next;
3898 merge (tempfiles, ntemps, ntemps, output_file);
3899 free (tempfiles);
3902 reap_all ();
3905 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3907 static void
3908 insertkey (struct keyfield *key_arg)
3910 struct keyfield **p;
3911 struct keyfield *key = xmemdup (key_arg, sizeof *key);
3913 for (p = &keylist; *p; p = &(*p)->next)
3914 continue;
3915 *p = key;
3916 key->next = NULL;
3919 /* Report a bad field specification SPEC, with extra info MSGID. */
3921 static void badfieldspec (char const *, char const *)
3922 ATTRIBUTE_NORETURN;
3923 static void
3924 badfieldspec (char const *spec, char const *msgid)
3926 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
3927 _(msgid), quote (spec));
3928 abort ();
3931 /* Report incompatible options. */
3933 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
3934 static void
3935 incompatible_options (char const *opts)
3937 error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), opts);
3938 abort ();
3941 /* Check compatibility of ordering options. */
3943 static void
3944 check_ordering_compatibility (void)
3946 struct keyfield *key;
3948 for (key = keylist; key; key = key->next)
3949 if (1 < (key->numeric + key->general_numeric + key->human_numeric
3950 + key->month + (key->version | key->random | !!key->ignore)))
3952 /* The following is too big, but guaranteed to be "big enough". */
3953 char opts[sizeof short_options];
3954 /* Clear flags we're not interested in. */
3955 key->skipsblanks = key->skipeblanks = key->reverse = false;
3956 key_to_opts (key, opts);
3957 incompatible_options (opts);
3961 /* Parse the leading integer in STRING and store the resulting value
3962 (which must fit into size_t) into *VAL. Return the address of the
3963 suffix after the integer. If the value is too large, silently
3964 substitute SIZE_MAX. If MSGID is NULL, return NULL after
3965 failure; otherwise, report MSGID and exit on failure. */
3967 static char const *
3968 parse_field_count (char const *string, size_t *val, char const *msgid)
3970 char *suffix;
3971 uintmax_t n;
3973 switch (xstrtoumax (string, &suffix, 10, &n, ""))
3975 case LONGINT_OK:
3976 case LONGINT_INVALID_SUFFIX_CHAR:
3977 *val = n;
3978 if (*val == n)
3979 break;
3980 /* Fall through. */
3981 case LONGINT_OVERFLOW:
3982 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
3983 *val = SIZE_MAX;
3984 break;
3986 case LONGINT_INVALID:
3987 if (msgid)
3988 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
3989 _(msgid), quote (string));
3990 return NULL;
3993 return suffix;
3996 /* Handle interrupts and hangups. */
3998 static void
3999 sighandler (int sig)
4001 if (! SA_NOCLDSTOP)
4002 signal (sig, SIG_IGN);
4004 cleanup ();
4006 signal (sig, SIG_DFL);
4007 raise (sig);
4010 /* Set the ordering options for KEY specified in S.
4011 Return the address of the first character in S that
4012 is not a valid ordering option.
4013 BLANKTYPE is the kind of blanks that 'b' should skip. */
4015 static char *
4016 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4018 while (*s)
4020 switch (*s)
4022 case 'b':
4023 if (blanktype == bl_start || blanktype == bl_both)
4024 key->skipsblanks = true;
4025 if (blanktype == bl_end || blanktype == bl_both)
4026 key->skipeblanks = true;
4027 break;
4028 case 'd':
4029 key->ignore = nondictionary;
4030 break;
4031 case 'f':
4032 key->translate = fold_toupper;
4033 break;
4034 case 'g':
4035 key->general_numeric = true;
4036 break;
4037 case 'h':
4038 key->human_numeric = true;
4039 break;
4040 case 'i':
4041 /* Option order should not matter, so don't let -i override
4042 -d. -d implies -i, but -i does not imply -d. */
4043 if (! key->ignore)
4044 key->ignore = nonprinting;
4045 break;
4046 case 'M':
4047 key->month = true;
4048 break;
4049 case 'n':
4050 key->numeric = true;
4051 break;
4052 case 'R':
4053 key->random = true;
4054 break;
4055 case 'r':
4056 key->reverse = true;
4057 break;
4058 case 'V':
4059 key->version = true;
4060 break;
4061 default:
4062 return (char *) s;
4064 ++s;
4066 return (char *) s;
4069 /* Initialize KEY. */
4071 static struct keyfield *
4072 key_init (struct keyfield *key)
4074 memset (key, 0, sizeof *key);
4075 key->eword = SIZE_MAX;
4076 return key;
4080 main (int argc, char **argv)
4082 struct keyfield *key;
4083 struct keyfield key_buf;
4084 struct keyfield gkey;
4085 bool gkey_only = false;
4086 char const *s;
4087 int c = 0;
4088 char checkonly = 0;
4089 bool mergeonly = false;
4090 char *random_source = NULL;
4091 bool need_random = false;
4092 size_t nthreads = 0;
4093 size_t nfiles = 0;
4094 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
4095 bool obsolete_usage = (posix2_version () < 200112);
4096 char **files;
4097 char *files_from = NULL;
4098 struct Tokens tok;
4099 char const *outfile = NULL;
4101 initialize_main (&argc, &argv);
4102 set_program_name (argv[0]);
4103 setlocale (LC_ALL, "");
4104 bindtextdomain (PACKAGE, LOCALEDIR);
4105 textdomain (PACKAGE);
4107 initialize_exit_failure (SORT_FAILURE);
4109 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4110 #if HAVE_NL_LANGINFO
4111 hard_LC_TIME = hard_locale (LC_TIME);
4112 #endif
4114 /* Get locale's representation of the decimal point. */
4116 struct lconv const *locale = localeconv ();
4118 /* If the locale doesn't define a decimal point, or if the decimal
4119 point is multibyte, use the C locale's decimal point. FIXME:
4120 add support for multibyte decimal points. */
4121 decimal_point = to_uchar (locale->decimal_point[0]);
4122 if (! decimal_point || locale->decimal_point[1])
4123 decimal_point = '.';
4125 /* FIXME: add support for multibyte thousands separators. */
4126 thousands_sep = to_uchar (*locale->thousands_sep);
4127 if (! thousands_sep || locale->thousands_sep[1])
4128 thousands_sep = -1;
4131 have_read_stdin = false;
4132 inittables ();
4135 size_t i;
4136 static int const sig[] =
4138 /* The usual suspects. */
4139 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4140 #ifdef SIGPOLL
4141 SIGPOLL,
4142 #endif
4143 #ifdef SIGPROF
4144 SIGPROF,
4145 #endif
4146 #ifdef SIGVTALRM
4147 SIGVTALRM,
4148 #endif
4149 #ifdef SIGXCPU
4150 SIGXCPU,
4151 #endif
4152 #ifdef SIGXFSZ
4153 SIGXFSZ,
4154 #endif
4156 enum { nsigs = ARRAY_CARDINALITY (sig) };
4158 #if SA_NOCLDSTOP
4159 struct sigaction act;
4161 sigemptyset (&caught_signals);
4162 for (i = 0; i < nsigs; i++)
4164 sigaction (sig[i], NULL, &act);
4165 if (act.sa_handler != SIG_IGN)
4166 sigaddset (&caught_signals, sig[i]);
4169 act.sa_handler = sighandler;
4170 act.sa_mask = caught_signals;
4171 act.sa_flags = 0;
4173 for (i = 0; i < nsigs; i++)
4174 if (sigismember (&caught_signals, sig[i]))
4175 sigaction (sig[i], &act, NULL);
4176 #else
4177 for (i = 0; i < nsigs; i++)
4178 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4180 signal (sig[i], sighandler);
4181 siginterrupt (sig[i], 1);
4183 #endif
4185 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4187 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4188 atexit (exit_cleanup);
4190 key_init (&gkey);
4191 gkey.sword = SIZE_MAX;
4193 files = xnmalloc (argc, sizeof *files);
4195 while (true)
4197 /* Parse an operand as a file after "--" was seen; or if
4198 pedantic and a file was seen, unless the POSIX version
4199 predates 1003.1-2001 and -c was not seen and the operand is
4200 "-o FILE" or "-oFILE". */
4201 int oi = -1;
4203 if (c == -1
4204 || (posixly_correct && nfiles != 0
4205 && ! (obsolete_usage
4206 && ! checkonly
4207 && optind != argc
4208 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4209 && (argv[optind][2] || optind + 1 != argc)))
4210 || ((c = getopt_long (argc, argv, short_options,
4211 long_options, &oi))
4212 == -1))
4214 if (argc <= optind)
4215 break;
4216 files[nfiles++] = argv[optind++];
4218 else switch (c)
4220 case 1:
4221 key = NULL;
4222 if (optarg[0] == '+')
4224 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4225 && ISDIGIT (argv[optind][1]));
4226 obsolete_usage |= minus_pos_usage && !posixly_correct;
4227 if (obsolete_usage)
4229 /* Treat +POS1 [-POS2] as a key if possible; but silently
4230 treat an operand as a file if it is not a valid +POS1. */
4231 key = key_init (&key_buf);
4232 s = parse_field_count (optarg + 1, &key->sword, NULL);
4233 if (s && *s == '.')
4234 s = parse_field_count (s + 1, &key->schar, NULL);
4235 if (! (key->sword || key->schar))
4236 key->sword = SIZE_MAX;
4237 if (! s || *set_ordering (s, key, bl_start))
4238 key = NULL;
4239 else
4241 if (minus_pos_usage)
4243 char const *optarg1 = argv[optind++];
4244 s = parse_field_count (optarg1 + 1, &key->eword,
4245 N_("invalid number after '-'"));
4246 /* When called with a non-NULL message ID,
4247 parse_field_count cannot return NULL. Tell static
4248 analysis tools that dereferencing S is safe. */
4249 assert (s);
4250 if (*s == '.')
4251 s = parse_field_count (s + 1, &key->echar,
4252 N_("invalid number after '.'"));
4253 if (!key->echar && key->eword)
4255 /* obsolescent syntax +A.x -B.y is equivalent to:
4256 -k A+1.x+1,B.y (when y = 0)
4257 -k A+1.x+1,B+1.y (when y > 0)
4258 So eword is decremented as in the -k case
4259 only when the end field (B) is specified and
4260 echar (y) is 0. */
4261 key->eword--;
4263 if (*set_ordering (s, key, bl_end))
4264 badfieldspec (optarg1,
4265 N_("stray character in field spec"));
4267 key->obsolete_used = true;
4268 insertkey (key);
4272 if (! key)
4273 files[nfiles++] = optarg;
4274 break;
4276 case SORT_OPTION:
4277 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4278 /* Fall through. */
4279 case 'b':
4280 case 'd':
4281 case 'f':
4282 case 'g':
4283 case 'h':
4284 case 'i':
4285 case 'M':
4286 case 'n':
4287 case 'r':
4288 case 'R':
4289 case 'V':
4291 char str[2];
4292 str[0] = c;
4293 str[1] = '\0';
4294 set_ordering (str, &gkey, bl_both);
4296 break;
4298 case CHECK_OPTION:
4299 c = (optarg
4300 ? XARGMATCH ("--check", optarg, check_args, check_types)
4301 : 'c');
4302 /* Fall through. */
4303 case 'c':
4304 case 'C':
4305 if (checkonly && checkonly != c)
4306 incompatible_options ("cC");
4307 checkonly = c;
4308 break;
4310 case COMPRESS_PROGRAM_OPTION:
4311 if (compress_program && !STREQ (compress_program, optarg))
4312 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4313 compress_program = optarg;
4314 break;
4316 case DEBUG_PROGRAM_OPTION:
4317 debug = true;
4318 break;
4320 case FILES0_FROM_OPTION:
4321 files_from = optarg;
4322 break;
4324 case 'k':
4325 key = key_init (&key_buf);
4327 /* Get POS1. */
4328 s = parse_field_count (optarg, &key->sword,
4329 N_("invalid number at field start"));
4330 if (! key->sword--)
4332 /* Provoke with 'sort -k0' */
4333 badfieldspec (optarg, N_("field number is zero"));
4335 if (*s == '.')
4337 s = parse_field_count (s + 1, &key->schar,
4338 N_("invalid number after '.'"));
4339 if (! key->schar--)
4341 /* Provoke with 'sort -k1.0' */
4342 badfieldspec (optarg, N_("character offset is zero"));
4345 if (! (key->sword || key->schar))
4346 key->sword = SIZE_MAX;
4347 s = set_ordering (s, key, bl_start);
4348 if (*s != ',')
4350 key->eword = SIZE_MAX;
4351 key->echar = 0;
4353 else
4355 /* Get POS2. */
4356 s = parse_field_count (s + 1, &key->eword,
4357 N_("invalid number after ','"));
4358 if (! key->eword--)
4360 /* Provoke with 'sort -k1,0' */
4361 badfieldspec (optarg, N_("field number is zero"));
4363 if (*s == '.')
4365 s = parse_field_count (s + 1, &key->echar,
4366 N_("invalid number after '.'"));
4368 s = set_ordering (s, key, bl_end);
4370 if (*s)
4371 badfieldspec (optarg, N_("stray character in field spec"));
4372 insertkey (key);
4373 break;
4375 case 'm':
4376 mergeonly = true;
4377 break;
4379 case NMERGE_OPTION:
4380 specify_nmerge (oi, c, optarg);
4381 break;
4383 case 'o':
4384 if (outfile && !STREQ (outfile, optarg))
4385 error (SORT_FAILURE, 0, _("multiple output files specified"));
4386 outfile = optarg;
4387 break;
4389 case RANDOM_SOURCE_OPTION:
4390 if (random_source && !STREQ (random_source, optarg))
4391 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4392 random_source = optarg;
4393 break;
4395 case 's':
4396 stable = true;
4397 break;
4399 case 'S':
4400 specify_sort_size (oi, c, optarg);
4401 break;
4403 case 't':
4405 char newtab = optarg[0];
4406 if (! newtab)
4407 error (SORT_FAILURE, 0, _("empty tab"));
4408 if (optarg[1])
4410 if (STREQ (optarg, "\\0"))
4411 newtab = '\0';
4412 else
4414 /* Provoke with 'sort -txx'. Complain about
4415 "multi-character tab" instead of "multibyte tab", so
4416 that the diagnostic's wording does not need to be
4417 changed once multibyte characters are supported. */
4418 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4419 quote (optarg));
4422 if (tab != TAB_DEFAULT && tab != newtab)
4423 error (SORT_FAILURE, 0, _("incompatible tabs"));
4424 tab = newtab;
4426 break;
4428 case 'T':
4429 add_temp_dir (optarg);
4430 break;
4432 case PARALLEL_OPTION:
4433 nthreads = specify_nthreads (oi, c, optarg);
4434 break;
4436 case 'u':
4437 unique = true;
4438 break;
4440 case 'y':
4441 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4442 through Solaris 7. It is also accepted by many non-Solaris
4443 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4444 -y is marked as obsolete starting with Solaris 8 (1999), but is
4445 still accepted as of Solaris 10 prerelease (2004).
4447 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4448 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4449 and which in general ignores the argument after "-y" if it
4450 consists entirely of digits (it can even be empty). */
4451 if (optarg == argv[optind - 1])
4453 char const *p;
4454 for (p = optarg; ISDIGIT (*p); p++)
4455 continue;
4456 optind -= (*p != '\0');
4458 break;
4460 case 'z':
4461 eolchar = 0;
4462 break;
4464 case_GETOPT_HELP_CHAR;
4466 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4468 default:
4469 usage (SORT_FAILURE);
4473 if (files_from)
4475 FILE *stream;
4477 /* When using --files0-from=F, you may not specify any files
4478 on the command-line. */
4479 if (nfiles)
4481 error (0, 0, _("extra operand %s"), quote (files[0]));
4482 fprintf (stderr, "%s\n",
4483 _("file operands cannot be combined with --files0-from"));
4484 usage (SORT_FAILURE);
4487 if (STREQ (files_from, "-"))
4488 stream = stdin;
4489 else
4491 stream = fopen (files_from, "r");
4492 if (stream == NULL)
4493 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
4494 quote (files_from));
4497 readtokens0_init (&tok);
4499 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
4500 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4501 quote (files_from));
4503 if (tok.n_tok)
4505 size_t i;
4506 free (files);
4507 files = tok.tok;
4508 nfiles = tok.n_tok;
4509 for (i = 0; i < nfiles; i++)
4511 if (STREQ (files[i], "-"))
4512 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4513 "no file name of %s allowed"),
4514 quote (files[i]));
4515 else if (files[i][0] == '\0')
4517 /* Using the standard 'filename:line-number:' prefix here is
4518 not totally appropriate, since NUL is the separator,
4519 not NL, but it might be better than nothing. */
4520 unsigned long int file_number = i + 1;
4521 error (SORT_FAILURE, 0,
4522 _("%s:%lu: invalid zero-length file name"),
4523 quotearg_colon (files_from), file_number);
4527 else
4528 error (SORT_FAILURE, 0, _("no input from %s"),
4529 quote (files_from));
4532 /* Inheritance of global options to individual keys. */
4533 for (key = keylist; key; key = key->next)
4535 if (default_key_compare (key) && !key->reverse)
4537 key->ignore = gkey.ignore;
4538 key->translate = gkey.translate;
4539 key->skipsblanks = gkey.skipsblanks;
4540 key->skipeblanks = gkey.skipeblanks;
4541 key->month = gkey.month;
4542 key->numeric = gkey.numeric;
4543 key->general_numeric = gkey.general_numeric;
4544 key->human_numeric = gkey.human_numeric;
4545 key->version = gkey.version;
4546 key->random = gkey.random;
4547 key->reverse = gkey.reverse;
4550 need_random |= key->random;
4553 if (!keylist && !default_key_compare (&gkey))
4555 gkey_only = true;
4556 insertkey (&gkey);
4557 need_random |= gkey.random;
4560 check_ordering_compatibility ();
4562 if (debug)
4564 if (checkonly || outfile)
4566 static char opts[] = "X --debug";
4567 opts[0] = (checkonly ? checkonly : 'o');
4568 incompatible_options (opts);
4571 /* Always output the locale in debug mode, since this
4572 is such a common source of confusion. */
4573 if (hard_LC_COLLATE)
4574 error (0, 0, _("using %s sorting rules"),
4575 quote (setlocale (LC_COLLATE, NULL)));
4576 else
4577 error (0, 0, _("using simple byte comparison"));
4578 key_warnings (&gkey, gkey_only);
4581 reverse = gkey.reverse;
4583 if (need_random)
4584 random_md5_state_init (random_source);
4586 if (temp_dir_count == 0)
4588 char const *tmp_dir = getenv ("TMPDIR");
4589 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4592 if (nfiles == 0)
4594 static char *minus = (char *) "-";
4595 nfiles = 1;
4596 free (files);
4597 files = &minus;
4600 /* Need to re-check that we meet the minimum requirement for memory
4601 usage with the final value for NMERGE. */
4602 if (0 < sort_size)
4603 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4605 if (checkonly)
4607 if (nfiles > 1)
4608 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4609 quote (files[1]), checkonly);
4611 if (outfile)
4613 static char opts[] = {0, 'o', 0};
4614 opts[0] = checkonly;
4615 incompatible_options (opts);
4618 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4619 input is not properly sorted. */
4620 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4623 if (mergeonly)
4625 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4626 size_t i;
4628 for (i = 0; i < nfiles; ++i)
4629 sortfiles[i].name = files[i];
4631 merge (sortfiles, 0, nfiles, outfile);
4632 IF_LINT (free (sortfiles));
4634 else
4636 if (!nthreads)
4638 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4639 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4642 /* Avoid integer overflow later. */
4643 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4644 nthreads = MIN (nthreads, nthreads_max);
4646 sort (files, nfiles, outfile, nthreads);
4649 if (have_read_stdin && fclose (stdin) == EOF)
4650 die (_("close failed"), "-");
4652 exit (EXIT_SUCCESS);