1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
27 #include <sys/resource.h>
28 #include <sys/types.h>
36 #include "filevercmp.h"
37 #include "hard-locale.h"
40 #include "ignore-value.h"
49 #include "readtokens0.h"
52 #include "strnumcmp.h"
54 #include "xnanosleep.h"
58 struct rlimit
{ size_t rlim_cur
; };
59 # define getrlimit(Resource, Rlp) (-1)
62 /* The official name of this program (e.g., no 'g' prefix). */
63 #define PROGRAM_NAME "sort"
66 proper_name ("Mike Haertel"), \
67 proper_name ("Paul Eggert")
69 #if HAVE_LANGINFO_CODESET
70 # include <langinfo.h>
73 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
76 # define SA_NOCLDSTOP 0
77 /* No sigprocmask. Always 'return' zero. */
78 # define sigprocmask(How, Set, Oset) (0)
80 # if ! HAVE_SIGINTERRUPT
81 # define siginterrupt(sig, flag) /* empty */
85 #if !defined OPEN_MAX && defined NR_OPEN
86 # define OPEN_MAX NR_OPEN
92 #define UCHAR_LIM (UCHAR_MAX + 1)
95 # define long_double long double
97 # define long_double double
99 # define strtold strtod
102 #ifndef DEFAULT_TMPDIR
103 # define DEFAULT_TMPDIR "/tmp"
106 /* Maximum number of lines to merge every time a NODE is taken from
107 the merge queue. Node is at LEVEL in the binary merge tree,
108 and is responsible for merging TOTAL lines. */
109 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
111 /* Heuristic value for the number of lines for which it is worth creating
112 a subthread, during an internal merge sort. I.e., it is a small number
113 of "average" lines for which sorting via two threads is faster than
114 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
115 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
116 lines of gensort -a output is sorted slightly faster with --parallel=2
117 than with --parallel=1. By contrast, using --parallel=1 is about 10%
118 faster than using --parallel=2 with a 64K-line input. */
119 enum { SUBTHREAD_LINES_HEURISTIC
= 128 * 1024 };
120 verify (4 <= SUBTHREAD_LINES_HEURISTIC
);
122 /* The number of threads after which there are
123 diminishing performance gains. */
124 enum { DEFAULT_MAX_THREADS
= 8 };
129 /* POSIX says to exit with status 1 if invoked with -c and the
130 input is not properly sorted. */
131 SORT_OUT_OF_ORDER
= 1,
133 /* POSIX says any other irregular exit must exit with a status
134 code greater than 1. */
140 /* The number of times we should try to fork a compression process
141 (we retry if the fork call fails). We don't _need_ to compress
142 temp files, this is just to reduce disk access, so this number
143 can be small. Each retry doubles in duration. */
144 MAX_FORK_TRIES_COMPRESS
= 4,
146 /* The number of times we should try to fork a decompression process.
147 If we can't fork a decompression process, we can't sort, so this
148 number should be big. Each retry doubles in duration. */
149 MAX_FORK_TRIES_DECOMPRESS
= 9
154 /* Level of the end-of-merge node, one level above the root. */
157 /* Level of the root node in merge tree. */
161 /* The representation of the decimal point in the current locale. */
162 static int decimal_point
;
164 /* Thousands separator; if -1, then there isn't one. */
165 static int thousands_sep
;
167 /* Nonzero if the corresponding locales are hard. */
168 static bool hard_LC_COLLATE
;
170 static bool hard_LC_TIME
;
173 #define NONZERO(x) ((x) != 0)
175 /* The kind of blanks for '-b' to skip in various options. */
176 enum blanktype
{ bl_start
, bl_end
, bl_both
};
178 /* The character marking end of line. Default to \n. */
179 static char eolchar
= '\n';
181 /* Lines are held in core as counted strings. */
184 char *text
; /* Text of the line. */
185 size_t length
; /* Length including final newline. */
186 char *keybeg
; /* Start of first key. */
187 char *keylim
; /* Limit of first key. */
193 char *buf
; /* Dynamically allocated buffer,
194 partitioned into 3 regions:
197 - an array of lines, in reverse order. */
198 size_t used
; /* Number of bytes used for input data. */
199 size_t nlines
; /* Number of lines in the line array. */
200 size_t alloc
; /* Number of bytes allocated. */
201 size_t left
; /* Number of bytes left from previous reads. */
202 size_t line_bytes
; /* Number of bytes to reserve for each line. */
203 bool eof
; /* An EOF has been read. */
209 size_t sword
; /* Zero-origin 'word' to start at. */
210 size_t schar
; /* Additional characters to skip. */
211 size_t eword
; /* Zero-origin last 'word' of key. */
212 size_t echar
; /* Additional characters in field. */
213 bool const *ignore
; /* Boolean array of characters to ignore. */
214 char const *translate
; /* Translation applied to characters. */
215 bool skipsblanks
; /* Skip leading blanks when finding start. */
216 bool skipeblanks
; /* Skip leading blanks when finding end. */
217 bool numeric
; /* Flag for numeric comparison. Handle
218 strings of digits with optional decimal
219 point, but no exponential notation. */
220 bool random
; /* Sort by random hash of key. */
221 bool general_numeric
; /* Flag for general, numeric comparison.
222 Handle numbers in exponential notation. */
223 bool human_numeric
; /* Flag for sorting by human readable
224 units with either SI xor IEC prefixes. */
225 bool month
; /* Flag for comparison by month name. */
226 bool reverse
; /* Reverse the sense of comparison. */
227 bool version
; /* sort by version number */
228 bool obsolete_used
; /* obsolescent key option format is used. */
229 struct keyfield
*next
; /* Next keyfield to try. */
238 /* Binary merge tree node. */
241 struct line
*lo
; /* Lines to merge from LO child node. */
242 struct line
*hi
; /* Lines to merge from HI child ndoe. */
243 struct line
*end_lo
; /* End of available lines from LO. */
244 struct line
*end_hi
; /* End of available lines from HI. */
245 struct line
**dest
; /* Pointer to destination of merge. */
246 size_t nlo
; /* Total Lines remaining from LO. */
247 size_t nhi
; /* Total lines remaining from HI. */
248 struct merge_node
*parent
; /* Parent node. */
249 struct merge_node
*lo_child
; /* LO child node. */
250 struct merge_node
*hi_child
; /* HI child node. */
251 unsigned int level
; /* Level in merge tree. */
252 bool queued
; /* Node is already in heap. */
253 pthread_mutex_t lock
; /* Lock for node operations. */
256 /* Priority queue of merge nodes. */
257 struct merge_node_queue
259 struct heap
*priority_queue
; /* Priority queue of merge tree nodes. */
260 pthread_mutex_t mutex
; /* Lock for queue operations. */
261 pthread_cond_t cond
; /* Conditional wait for empty queue to populate
265 /* FIXME: None of these tables work with multibyte character sets.
266 Also, there are many other bugs when handling multibyte characters.
267 One way to fix this is to rewrite 'sort' to use wide characters
268 internally, but doing this with good performance is a bit
271 /* Table of blanks. */
272 static bool blanks
[UCHAR_LIM
];
274 /* Table of non-printing characters. */
275 static bool nonprinting
[UCHAR_LIM
];
277 /* Table of non-dictionary characters (not letters, digits, or blanks). */
278 static bool nondictionary
[UCHAR_LIM
];
280 /* Translation table folding lower case to upper. */
281 static char fold_toupper
[UCHAR_LIM
];
283 #define MONTHS_PER_YEAR 12
285 /* Table mapping month names to integers.
286 Alphabetic order allows binary search. */
287 static struct month monthtab
[] =
303 /* During the merge phase, the number of files to merge at once. */
304 #define NMERGE_DEFAULT 16
306 /* Minimum size for a merge or check buffer. */
307 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
309 /* Minimum sort size; the code might not work with smaller sizes. */
310 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
312 /* The number of bytes needed for a merge or check buffer, which can
313 function relatively efficiently even if it holds only one line. If
314 a longer line is seen, this value is increased. */
315 static size_t merge_buffer_size
= MAX (MIN_MERGE_BUFFER_SIZE
, 256 * 1024);
317 /* The approximate maximum number of bytes of main memory to use, as
318 specified by the user. Zero if the user has not specified a size. */
319 static size_t sort_size
;
321 /* The initial allocation factor for non-regular files.
322 This is used, e.g., when reading from a pipe.
323 Don't make it too big, since it is multiplied by ~130 to
324 obtain the size of the actual buffer sort will allocate.
325 Also, there may be 8 threads all doing this at the same time. */
326 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
328 /* Array of directory names in which any temporary files are to be created. */
329 static char const **temp_dirs
;
331 /* Number of temporary directory names used. */
332 static size_t temp_dir_count
;
334 /* Number of allocated slots in temp_dirs. */
335 static size_t temp_dir_alloc
;
337 /* Flag to reverse the order of all comparisons. */
340 /* Flag for stable sort. This turns off the last ditch bytewise
341 comparison of lines, and instead leaves lines in the same order
342 they were read if all keys compare equal. */
345 /* If TAB has this value, blanks separate fields. */
346 enum { TAB_DEFAULT
= CHAR_MAX
+ 1 };
348 /* Tab character separating fields. If TAB_DEFAULT, then fields are
349 separated by the empty string between a non-blank character and a blank
351 static int tab
= TAB_DEFAULT
;
353 /* Flag to remove consecutive duplicate lines from the output.
354 Only the last of a sequence of equal lines will be output. */
357 /* Nonzero if any of the input files are the standard input. */
358 static bool have_read_stdin
;
360 /* List of key field comparisons to be tried. */
361 static struct keyfield
*keylist
;
363 /* Program used to (de)compress temp files. Must accept -d. */
364 static char const *compress_program
;
366 /* Annotate the output with extra info to aid the user. */
369 /* Maximum number of files to merge in one go. If more than this
370 number are present, temp files will be used. */
371 static unsigned int nmerge
= NMERGE_DEFAULT
;
373 /* Report MESSAGE for FILE, then clean up and exit.
374 If FILE is null, it represents standard output. */
376 static void die (char const *, char const *) ATTRIBUTE_NORETURN
;
378 die (char const *message
, char const *file
)
380 error (0, errno
, "%s: %s", message
, file
? file
: _("standard output"));
387 if (status
!= EXIT_SUCCESS
)
392 Usage: %s [OPTION]... [FILE]...\n\
393 or: %s [OPTION]... --files0-from=F\n\
395 program_name
, program_name
);
397 Write sorted concatenation of all FILE(s) to standard output.\n\
401 Mandatory arguments to long options are mandatory for short options too.\n\
408 -b, --ignore-leading-blanks ignore leading blanks\n\
409 -d, --dictionary-order consider only blanks and alphanumeric characters\
411 -f, --ignore-case fold lower case to upper case characters\n\
414 -g, --general-numeric-sort compare according to general numerical value\n\
415 -i, --ignore-nonprinting consider only printable characters\n\
416 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
419 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
422 -n, --numeric-sort compare according to string numerical value\n\
423 -R, --random-sort sort by random hash of keys\n\
424 --random-source=FILE get random bytes from FILE\n\
425 -r, --reverse reverse the result of comparisons\n\
428 --sort=WORD sort according to WORD:\n\
429 general-numeric -g, human-numeric -h, month -M,\
431 numeric -n, random -R, version -V\n\
432 -V, --version-sort natural sort of (version) numbers within text\n\
440 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
441 for more use temp files\n\
444 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
445 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
447 --compress-program=PROG compress temporaries with PROG;\n\
448 decompress them with PROG -d\n\
451 --debug annotate the part of the line used to sort,\n\
452 and warn about questionable usage to stderr\n\
453 --files0-from=F read input from the files specified by\n\
454 NUL-terminated names in file F;\n\
455 If F is - then read names from standard input\n\
458 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
459 -m, --merge merge already sorted files; do not sort\n\
462 -o, --output=FILE write result to FILE instead of standard output\n\
463 -s, --stable stabilize sort by disabling last-resort comparison\
465 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
468 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
469 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
470 multiple options specify multiple directories\n\
471 --parallel=N change the number of sorts run concurrently to N\n\
472 -u, --unique with -c, check for strict ordering;\n\
473 without -c, output only the first of an equal run\
477 -z, --zero-terminated end lines with 0 byte, not newline\n\
479 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
480 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
483 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
484 field number and C a character position in the field; both are origin 1, and\n\
485 the stop position defaults to the line's end. If neither -t nor -b is in\n\
486 effect, characters in a field are counted from the beginning of the preceding\n\
487 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
489 which override global ordering options for that key. If no key is given, use\n\
490 the entire line as the key.\n\
492 SIZE may be followed by the following multiplicative suffixes:\n\
495 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
497 With no FILE, or when FILE is -, read standard input.\n\
500 The locale specified by the environment affects sort order.\n\
501 Set LC_ALL=C to get the traditional sort order that uses\n\
502 native byte values.\n\
504 emit_ancillary_info ();
510 /* For long options that have no equivalent short option, use a
511 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
514 CHECK_OPTION
= CHAR_MAX
+ 1,
515 COMPRESS_PROGRAM_OPTION
,
516 DEBUG_PROGRAM_OPTION
,
519 RANDOM_SOURCE_OPTION
,
524 static char const short_options
[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
526 static struct option
const long_options
[] =
528 {"ignore-leading-blanks", no_argument
, NULL
, 'b'},
529 {"check", optional_argument
, NULL
, CHECK_OPTION
},
530 {"compress-program", required_argument
, NULL
, COMPRESS_PROGRAM_OPTION
},
531 {"debug", no_argument
, NULL
, DEBUG_PROGRAM_OPTION
},
532 {"dictionary-order", no_argument
, NULL
, 'd'},
533 {"ignore-case", no_argument
, NULL
, 'f'},
534 {"files0-from", required_argument
, NULL
, FILES0_FROM_OPTION
},
535 {"general-numeric-sort", no_argument
, NULL
, 'g'},
536 {"ignore-nonprinting", no_argument
, NULL
, 'i'},
537 {"key", required_argument
, NULL
, 'k'},
538 {"merge", no_argument
, NULL
, 'm'},
539 {"month-sort", no_argument
, NULL
, 'M'},
540 {"numeric-sort", no_argument
, NULL
, 'n'},
541 {"human-numeric-sort", no_argument
, NULL
, 'h'},
542 {"version-sort", no_argument
, NULL
, 'V'},
543 {"random-sort", no_argument
, NULL
, 'R'},
544 {"random-source", required_argument
, NULL
, RANDOM_SOURCE_OPTION
},
545 {"sort", required_argument
, NULL
, SORT_OPTION
},
546 {"output", required_argument
, NULL
, 'o'},
547 {"reverse", no_argument
, NULL
, 'r'},
548 {"stable", no_argument
, NULL
, 's'},
549 {"batch-size", required_argument
, NULL
, NMERGE_OPTION
},
550 {"buffer-size", required_argument
, NULL
, 'S'},
551 {"field-separator", required_argument
, NULL
, 't'},
552 {"temporary-directory", required_argument
, NULL
, 'T'},
553 {"unique", no_argument
, NULL
, 'u'},
554 {"zero-terminated", no_argument
, NULL
, 'z'},
555 {"parallel", required_argument
, NULL
, PARALLEL_OPTION
},
556 {GETOPT_HELP_OPTION_DECL
},
557 {GETOPT_VERSION_OPTION_DECL
},
561 #define CHECK_TABLE \
563 _ct_("silent", 'C') \
564 _ct_("diagnose-first", 'c')
566 static char const *const check_args
[] =
568 #define _ct_(_s, _c) _s,
572 static char const check_types
[] =
574 #define _ct_(_s, _c) _c,
580 _st_("general-numeric", 'g') \
581 _st_("human-numeric", 'h') \
583 _st_("numeric", 'n') \
584 _st_("random", 'R') \
587 static char const *const sort_args
[] =
589 #define _st_(_s, _c) _s,
593 static char const sort_types
[] =
595 #define _st_(_s, _c) _c,
600 /* The set of signals that are caught. */
601 static sigset_t caught_signals
;
603 /* Critical section status. */
610 /* Enter a critical section. */
611 static struct cs_status
614 struct cs_status status
;
615 status
.valid
= (sigprocmask (SIG_BLOCK
, &caught_signals
, &status
.sigs
) == 0);
619 /* Leave a critical section. */
621 cs_leave (struct cs_status status
)
625 /* Ignore failure when restoring the signal mask. */
626 sigprocmask (SIG_SETMASK
, &status
.sigs
, NULL
);
630 /* Possible states for a temp file. If compressed, the file's status
631 is unreaped or reaped, depending on whether 'sort' has waited for
632 the subprocess to finish. */
633 enum { UNCOMPRESSED
, UNREAPED
, REAPED
};
635 /* The list of temporary files. */
638 struct tempnode
*volatile next
;
639 pid_t pid
; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
641 char name
[1]; /* Actual size is 1 + file name length. */
643 static struct tempnode
*volatile temphead
;
644 static struct tempnode
*volatile *temptail
= &temphead
;
646 /* A file to be sorted. */
649 /* The file's name. */
652 /* Nonnull if this is a temporary file, in which case NAME == TEMP->name. */
653 struct tempnode
*temp
;
656 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
657 static Hash_table
*proctab
;
659 enum { INIT_PROCTAB_SIZE
= 47 };
662 proctab_hasher (void const *entry
, size_t tabsize
)
664 struct tempnode
const *node
= entry
;
665 return node
->pid
% tabsize
;
669 proctab_comparator (void const *e1
, void const *e2
)
671 struct tempnode
const *n1
= e1
;
672 struct tempnode
const *n2
= e2
;
673 return n1
->pid
== n2
->pid
;
676 /* The number of unreaped child processes. */
679 static bool delete_proc (pid_t
);
681 /* If PID is positive, wait for the child process with that PID to
682 exit, and assume that PID has already been removed from the process
683 table. If PID is 0 or -1, clean up some child that has exited (by
684 waiting for it, and removing it from the proc table) and return the
685 child's process ID. However, if PID is 0 and no children have
686 exited, return 0 without waiting. */
692 pid_t cpid
= waitpid ((pid
? pid
: -1), &status
, (pid
? 0 : WNOHANG
));
695 error (SORT_FAILURE
, errno
, _("waiting for %s [-d]"),
697 else if (0 < cpid
&& (0 < pid
|| delete_proc (cpid
)))
699 if (! WIFEXITED (status
) || WEXITSTATUS (status
))
700 error (SORT_FAILURE
, 0, _("%s [-d] terminated abnormally"),
708 /* TEMP represents a new process; add it to the process table. Create
709 the process table the first time it's called. */
712 register_proc (struct tempnode
*temp
)
716 proctab
= hash_initialize (INIT_PROCTAB_SIZE
, NULL
,
724 temp
->state
= UNREAPED
;
726 if (! hash_insert (proctab
, temp
))
730 /* If PID is in the process table, remove it and return true.
731 Otherwise, return false. */
734 delete_proc (pid_t pid
)
736 struct tempnode test
;
739 struct tempnode
*node
= hash_delete (proctab
, &test
);
742 node
->state
= REAPED
;
746 /* Remove PID from the process table, and wait for it to exit if it
750 wait_proc (pid_t pid
)
752 if (delete_proc (pid
))
756 /* Reap any exited children. Do not block; reap only those that have
762 while (0 < nprocs
&& reap (0))
766 /* Reap at least one exited child, waiting if necessary. */
775 /* Reap all children, waiting if necessary. */
784 /* Clean up any remaining temporary files. */
789 struct tempnode
const *node
;
791 for (node
= temphead
; node
; node
= node
->next
)
796 /* Cleanup actions to take when exiting. */
803 /* Clean up any remaining temporary files in a critical section so
804 that a signal handler does not try to clean them too. */
805 struct cs_status cs
= cs_enter ();
813 /* Create a new temporary file, returning its newly allocated tempnode.
814 Store into *PFD the file descriptor open for writing.
815 If the creation fails, return NULL and store -1 into *PFD if the
816 failure is due to file descriptor exhaustion and
817 SURVIVE_FD_EXHAUSTION; otherwise, die. */
819 static struct tempnode
*
820 create_temp_file (int *pfd
, bool survive_fd_exhaustion
)
822 static char const slashbase
[] = "/sortXXXXXX";
823 static size_t temp_dir_index
;
826 char const *temp_dir
= temp_dirs
[temp_dir_index
];
827 size_t len
= strlen (temp_dir
);
828 struct tempnode
*node
=
829 xmalloc (offsetof (struct tempnode
, name
) + len
+ sizeof slashbase
);
830 char *file
= node
->name
;
833 memcpy (file
, temp_dir
, len
);
834 memcpy (file
+ len
, slashbase
, sizeof slashbase
);
836 if (++temp_dir_index
== temp_dir_count
)
839 /* Create the temporary file in a critical section, to avoid races. */
845 temptail
= &node
->next
;
853 if (! (survive_fd_exhaustion
&& errno
== EMFILE
))
854 error (SORT_FAILURE
, errno
, _("cannot create temporary file in %s"),
864 /* Return a stream for FILE, opened with mode HOW. A null FILE means
865 standard output; HOW should be "w". When opening for input, "-"
866 means standard input. To avoid confusion, do not return file
867 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
868 opening an ordinary FILE. Return NULL if unsuccessful.
870 fadvise() is used to specify an access pattern for input files.
871 There are a few hints we could possibly provide,
872 and after careful testing it was decided that
873 specifying POSIX_FADV_SEQUENTIAL was not detrimental
874 to any cases. On Linux 2.6.31, this option doubles
875 the size of read ahead performed and thus was seen to
878 Sorting with a smaller internal buffer
879 Reading from faster flash devices
881 In _addition_ one could also specify other hints...
883 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
884 at least uses that to _synchronously_ prepopulate the cache
885 with the specified range. While sort does need to
886 read all of its input before outputting, a synchronous
887 read of the whole file up front precludes any processing
888 that sort could do in parallel with the system doing
889 read ahead of the data. This was seen to have negative effects
890 in a couple of cases:
892 Sorting with a smaller internal buffer
893 Note this option was seen to shorten the runtime for sort
894 on a multicore system with lots of RAM and other processes
895 competing for CPU. It could be argued that more explicit
896 scheduling hints with 'nice' et. al. are more appropriate
899 POSIX_FADV_NOREUSE is a possibility as it could lower
900 the priority of input data in the cache as sort will
901 only need to process it once. However its functionality
902 has changed over Linux kernel versions and as of 2.6.31
903 it does nothing and thus we can't depend on what it might
906 POSIX_FADV_DONTNEED is not appropriate for user specified
907 input files, but for temp files we do want to drop the
908 cache immediately after processing. This is done implicitly
909 however when the files are unlinked. */
912 stream_open (char const *file
, char const *how
)
919 if (STREQ (file
, "-"))
921 have_read_stdin
= true;
925 fp
= fopen (file
, how
);
926 fadvise (fp
, FADVISE_SEQUENTIAL
);
929 return fopen (file
, how
);
932 /* Same as stream_open, except always return a non-null value; die on
936 xfopen (char const *file
, char const *how
)
938 FILE *fp
= stream_open (file
, how
);
940 die (_("open failed"), file
);
944 /* Close FP, whose name is FILE, and report any errors. */
947 xfclose (FILE *fp
, char const *file
)
952 /* Allow reading stdin from tty more than once. */
958 /* Don't close stdout just yet. close_stdout does that. */
959 if (fflush (fp
) != 0)
960 die (_("fflush failed"), file
);
964 if (fclose (fp
) != 0)
965 die (_("close failed"), file
);
971 dup2_or_die (int oldfd
, int newfd
)
973 if (dup2 (oldfd
, newfd
) < 0)
974 error (SORT_FAILURE
, errno
, _("dup2 failed"));
977 /* Fork a child process for piping to and do common cleanup. The
978 TRIES parameter tells us how many times to try to fork before
979 giving up. Return the PID of the child, or -1 (setting errno)
983 pipe_fork (int pipefds
[2], size_t tries
)
985 #if HAVE_WORKING_FORK
986 struct tempnode
*saved_temphead
;
988 double wait_retry
= 0.25;
989 pid_t pid
IF_LINT ( = -1);
992 if (pipe (pipefds
) < 0)
995 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
996 uncontrolled subprocess generation can hurt performance significantly.
997 Allow at most NMERGE + 2 subprocesses, on the theory that there
998 may be some useful parallelism by letting compression for the
999 previous merge finish (1 subprocess) in parallel with the current
1000 merge (NMERGE + 1 subprocesses). */
1002 if (nmerge
+ 1 < nprocs
)
1007 /* This is so the child process won't delete our temp files
1008 if it receives a signal before exec-ing. */
1010 saved_temphead
= temphead
;
1014 saved_errno
= errno
;
1016 temphead
= saved_temphead
;
1019 errno
= saved_errno
;
1021 if (0 <= pid
|| errno
!= EAGAIN
)
1025 xnanosleep (wait_retry
);
1033 saved_errno
= errno
;
1036 errno
= saved_errno
;
1040 close (STDIN_FILENO
);
1041 close (STDOUT_FILENO
);
1048 #else /* ! HAVE_WORKING_FORK */
1053 /* Create a temporary file and, if asked for, start a compressor
1054 to that file. Set *PFP to the file handle and return
1055 the address of the new temp node. If the creation
1056 fails, return NULL if the failure is due to file descriptor
1057 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1059 static struct tempnode
*
1060 maybe_create_temp (FILE **pfp
, bool survive_fd_exhaustion
)
1063 struct tempnode
*node
= create_temp_file (&tempfd
, survive_fd_exhaustion
);
1067 node
->state
= UNCOMPRESSED
;
1069 if (compress_program
)
1073 node
->pid
= pipe_fork (pipefds
, MAX_FORK_TRIES_COMPRESS
);
1078 tempfd
= pipefds
[1];
1080 register_proc (node
);
1082 else if (node
->pid
== 0)
1085 dup2_or_die (tempfd
, STDOUT_FILENO
);
1087 dup2_or_die (pipefds
[0], STDIN_FILENO
);
1090 if (execlp (compress_program
, compress_program
, (char *) NULL
) < 0)
1091 error (SORT_FAILURE
, errno
, _("couldn't execute %s"),
1096 *pfp
= fdopen (tempfd
, "w");
1098 die (_("couldn't create temporary file"), node
->name
);
1103 /* Create a temporary file and, if asked for, start a compressor
1104 to that file. Set *PFP to the file handle and return the address
1105 of the new temp node. Die on failure. */
1107 static struct tempnode
*
1108 create_temp (FILE **pfp
)
1110 return maybe_create_temp (pfp
, false);
1113 /* Open a compressed temp file and start a decompression process through
1114 which to filter the input. Return NULL (setting errno to
1115 EMFILE) if we ran out of file descriptors, and die on any other
1119 open_temp (struct tempnode
*temp
)
1121 int tempfd
, pipefds
[2];
1124 if (temp
->state
== UNREAPED
)
1125 wait_proc (temp
->pid
);
1127 tempfd
= open (temp
->name
, O_RDONLY
);
1131 pid_t child
= pipe_fork (pipefds
, MAX_FORK_TRIES_DECOMPRESS
);
1136 if (errno
!= EMFILE
)
1137 error (SORT_FAILURE
, errno
, _("couldn't create process for %s -d"),
1145 dup2_or_die (tempfd
, STDIN_FILENO
);
1147 dup2_or_die (pipefds
[1], STDOUT_FILENO
);
1150 execlp (compress_program
, compress_program
, "-d", (char *) NULL
);
1151 error (SORT_FAILURE
, errno
, _("couldn't execute %s -d"),
1156 register_proc (temp
);
1160 fp
= fdopen (pipefds
[0], "r");
1163 int saved_errno
= errno
;
1165 errno
= saved_errno
;
1173 /* Append DIR to the array of temporary directory names. */
1175 add_temp_dir (char const *dir
)
1177 if (temp_dir_count
== temp_dir_alloc
)
1178 temp_dirs
= X2NREALLOC (temp_dirs
, &temp_dir_alloc
);
1180 temp_dirs
[temp_dir_count
++] = dir
;
1183 /* Remove NAME from the list of temporary files. */
1186 zaptemp (char const *name
)
1188 struct tempnode
*volatile *pnode
;
1189 struct tempnode
*node
;
1190 struct tempnode
*next
;
1192 int unlink_errno
= 0;
1193 struct cs_status cs
;
1195 for (pnode
= &temphead
; (node
= *pnode
)->name
!= name
; pnode
= &node
->next
)
1198 if (node
->state
== UNREAPED
)
1199 wait_proc (node
->pid
);
1201 /* Unlink the temporary file in a critical section to avoid races. */
1204 unlink_status
= unlink (name
);
1205 unlink_errno
= errno
;
1209 if (unlink_status
!= 0)
1210 error (0, unlink_errno
, _("warning: cannot remove: %s"), name
);
1216 #if HAVE_NL_LANGINFO
1219 struct_month_cmp (void const *m1
, void const *m2
)
1221 struct month
const *month1
= m1
;
1222 struct month
const *month2
= m2
;
1223 return strcmp (month1
->name
, month2
->name
);
1228 /* Initialize the character class tables. */
1235 for (i
= 0; i
< UCHAR_LIM
; ++i
)
1237 blanks
[i
] = !! isblank (i
);
1238 nonprinting
[i
] = ! isprint (i
);
1239 nondictionary
[i
] = ! isalnum (i
) && ! isblank (i
);
1240 fold_toupper
[i
] = toupper (i
);
1243 #if HAVE_NL_LANGINFO
1244 /* If we're not in the "C" locale, read different names for months. */
1247 for (i
= 0; i
< MONTHS_PER_YEAR
; i
++)
1254 s
= nl_langinfo (ABMON_1
+ i
);
1256 monthtab
[i
].name
= name
= xmalloc (s_len
+ 1);
1257 monthtab
[i
].val
= i
+ 1;
1259 for (j
= k
= 0; j
< s_len
; j
++)
1260 if (! isblank (to_uchar (s
[j
])))
1261 name
[k
++] = fold_toupper
[to_uchar (s
[j
])];
1264 qsort (monthtab
, MONTHS_PER_YEAR
, sizeof *monthtab
, struct_month_cmp
);
1269 /* Specify how many inputs may be merged at once.
1270 This may be set on the command-line with the
1271 --batch-size option. */
1273 specify_nmerge (int oi
, char c
, char const *s
)
1276 struct rlimit rlimit
;
1277 enum strtol_error e
= xstrtoumax (s
, NULL
, 10, &n
, NULL
);
1279 /* Try to find out how many file descriptors we'll be able
1280 to open. We need at least nmerge + 3 (STDIN_FILENO,
1281 STDOUT_FILENO and STDERR_FILENO). */
1282 unsigned int max_nmerge
= ((getrlimit (RLIMIT_NOFILE
, &rlimit
) == 0
1287 if (e
== LONGINT_OK
)
1291 e
= LONGINT_OVERFLOW
;
1296 error (0, 0, _("invalid --%s argument %s"),
1297 long_options
[oi
].name
, quote (s
));
1298 error (SORT_FAILURE
, 0,
1299 _("minimum --%s argument is %s"),
1300 long_options
[oi
].name
, quote ("2"));
1302 else if (max_nmerge
< nmerge
)
1304 e
= LONGINT_OVERFLOW
;
1311 if (e
== LONGINT_OVERFLOW
)
1313 char max_nmerge_buf
[INT_BUFSIZE_BOUND (max_nmerge
)];
1314 error (0, 0, _("--%s argument %s too large"),
1315 long_options
[oi
].name
, quote (s
));
1316 error (SORT_FAILURE
, 0,
1317 _("maximum --%s argument with current rlimit is %s"),
1318 long_options
[oi
].name
,
1319 uinttostr (max_nmerge
, max_nmerge_buf
));
1322 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1325 /* Specify the amount of main memory to use when sorting. */
1327 specify_sort_size (int oi
, char c
, char const *s
)
1331 enum strtol_error e
= xstrtoumax (s
, &suffix
, 10, &n
, "EgGkKmMPtTYZ");
1333 /* The default unit is KiB. */
1334 if (e
== LONGINT_OK
&& ISDIGIT (suffix
[-1]))
1336 if (n
<= UINTMAX_MAX
/ 1024)
1339 e
= LONGINT_OVERFLOW
;
1342 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1343 if (e
== LONGINT_INVALID_SUFFIX_CHAR
&& ISDIGIT (suffix
[-1]) && ! suffix
[1])
1352 double mem
= physmem_total () * n
/ 100;
1354 /* Use "<", not "<=", to avoid problems with rounding. */
1355 if (mem
< UINTMAX_MAX
)
1361 e
= LONGINT_OVERFLOW
;
1366 if (e
== LONGINT_OK
)
1368 /* If multiple sort sizes are specified, take the maximum, so
1369 that option order does not matter. */
1376 sort_size
= MAX (sort_size
, MIN_SORT_SIZE
);
1380 e
= LONGINT_OVERFLOW
;
1383 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1386 /* Specify the number of threads to spawn during internal sort. */
1388 specify_nthreads (int oi
, char c
, char const *s
)
1390 unsigned long int nthreads
;
1391 enum strtol_error e
= xstrtoul (s
, NULL
, 10, &nthreads
, "");
1392 if (e
== LONGINT_OVERFLOW
)
1394 if (e
!= LONGINT_OK
)
1395 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1396 if (SIZE_MAX
< nthreads
)
1397 nthreads
= SIZE_MAX
;
1399 error (SORT_FAILURE
, 0, _("number in parallel must be nonzero"));
1403 /* Return the default sort size. */
1405 default_sort_size (void)
1407 /* Let SIZE be MEM, but no more than the maximum object size or
1408 system resource limits. Don't bother to check for values like
1409 RLIM_INFINITY since in practice they are not much less than SIZE_MAX. */
1410 size_t size
= SIZE_MAX
;
1411 struct rlimit rlimit
;
1412 if (getrlimit (RLIMIT_DATA
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
1413 size
= rlimit
.rlim_cur
;
1415 if (getrlimit (RLIMIT_AS
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
1416 size
= rlimit
.rlim_cur
;
1419 /* Leave a large safety margin for the above limits, as failure can
1420 occur when they are exceeded. */
1424 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1425 Exceeding RSS is not fatal, but can be quite slow. */
1426 if (getrlimit (RLIMIT_RSS
, &rlimit
) == 0 && rlimit
.rlim_cur
/ 16 * 15 < size
)
1427 size
= rlimit
.rlim_cur
/ 16 * 15;
1430 /* Let MEM be available memory or 1/8 of total memory, whichever
1432 double avail
= physmem_available ();
1433 double total
= physmem_total ();
1434 double mem
= MAX (avail
, total
/ 8);
1436 /* Return the minimum of MEM and SIZE, but no less than
1437 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1438 right when only one argument is floating point. */
1441 return MAX (size
, MIN_SORT_SIZE
);
1444 /* Return the sort buffer size to use with the input files identified
1445 by FPS and FILES, which are alternate names of the same files.
1446 NFILES gives the number of input files; NFPS may be less. Assume
1447 that each input line requires LINE_BYTES extra bytes' worth of line
1448 information. Do not exceed the size bound specified by the user
1449 (or a default size bound, if the user does not specify one). */
1452 sort_buffer_size (FILE *const *fps
, size_t nfps
,
1453 char *const *files
, size_t nfiles
,
1456 /* A bound on the input size. If zero, the bound hasn't been
1458 static size_t size_bound
;
1460 /* In the worst case, each input byte is a newline. */
1461 size_t worst_case_per_input_byte
= line_bytes
+ 1;
1463 /* Keep enough room for one extra input line and an extra byte.
1464 This extra room might be needed when preparing to read EOF. */
1465 size_t size
= worst_case_per_input_byte
+ 1;
1469 for (i
= 0; i
< nfiles
; i
++)
1475 if ((i
< nfps
? fstat (fileno (fps
[i
]), &st
)
1476 : STREQ (files
[i
], "-") ? fstat (STDIN_FILENO
, &st
)
1477 : stat (files
[i
], &st
))
1479 die (_("stat failed"), files
[i
]);
1481 if (S_ISREG (st
.st_mode
))
1482 file_size
= st
.st_size
;
1485 /* The file has unknown size. If the user specified a sort
1486 buffer size, use that; otherwise, guess the size. */
1489 file_size
= INPUT_FILE_SIZE_GUESS
;
1494 size_bound
= sort_size
;
1496 size_bound
= default_sort_size ();
1499 /* Add the amount of memory needed to represent the worst case
1500 where the input consists entirely of newlines followed by a
1501 single non-newline. Check for overflow. */
1502 worst_case
= file_size
* worst_case_per_input_byte
+ 1;
1503 if (file_size
!= worst_case
/ worst_case_per_input_byte
1504 || size_bound
- size
<= worst_case
)
1512 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1513 must be at least sizeof (struct line). Allocate ALLOC bytes
1517 initbuf (struct buffer
*buf
, size_t line_bytes
, size_t alloc
)
1519 /* Ensure that the line array is properly aligned. If the desired
1520 size cannot be allocated, repeatedly halve it until allocation
1521 succeeds. The smaller allocation may hurt overall performance,
1522 but that's better than failing. */
1525 alloc
+= sizeof (struct line
) - alloc
% sizeof (struct line
);
1526 buf
->buf
= malloc (alloc
);
1530 if (alloc
<= line_bytes
+ 1)
1534 buf
->line_bytes
= line_bytes
;
1536 buf
->used
= buf
->left
= buf
->nlines
= 0;
1540 /* Return one past the limit of the line array. */
1542 static inline struct line
*
1543 buffer_linelim (struct buffer
const *buf
)
1545 return (struct line
*) (buf
->buf
+ buf
->alloc
);
1548 /* Return a pointer to the first character of the field specified
1552 begfield (struct line
const *line
, struct keyfield
const *key
)
1554 char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
1555 size_t sword
= key
->sword
;
1556 size_t schar
= key
->schar
;
1558 /* The leading field separator itself is included in a field when -t
1561 if (tab
!= TAB_DEFAULT
)
1562 while (ptr
< lim
&& sword
--)
1564 while (ptr
< lim
&& *ptr
!= tab
)
1570 while (ptr
< lim
&& sword
--)
1572 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1574 while (ptr
< lim
&& !blanks
[to_uchar (*ptr
)])
1578 /* If we're ignoring leading blanks when computing the Start
1579 of the field, skip past them here. */
1580 if (key
->skipsblanks
)
1581 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1584 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1585 ptr
= MIN (lim
, ptr
+ schar
);
1590 /* Return the limit of (a pointer to the first character after) the field
1591 in LINE specified by KEY. */
1594 limfield (struct line
const *line
, struct keyfield
const *key
)
1596 char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
1597 size_t eword
= key
->eword
, echar
= key
->echar
;
1600 eword
++; /* Skip all of end field. */
1602 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1603 whichever comes first. If there are more than EWORD fields, leave
1604 PTR pointing at the beginning of the field having zero-based index,
1605 EWORD. If a delimiter character was specified (via -t), then that
1606 'beginning' is the first character following the delimiting TAB.
1607 Otherwise, leave PTR pointing at the first 'blank' character after
1608 the preceding field. */
1609 if (tab
!= TAB_DEFAULT
)
1610 while (ptr
< lim
&& eword
--)
1612 while (ptr
< lim
&& *ptr
!= tab
)
1614 if (ptr
< lim
&& (eword
|| echar
))
1618 while (ptr
< lim
&& eword
--)
1620 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1622 while (ptr
< lim
&& !blanks
[to_uchar (*ptr
)])
1626 #ifdef POSIX_UNSPECIFIED
1627 /* The following block of code makes GNU sort incompatible with
1628 standard Unix sort, so it's ifdef'd out for now.
1629 The POSIX spec isn't clear on how to interpret this.
1630 FIXME: request clarification.
1632 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1633 Date: Thu, 30 May 96 12:20:41 -0400
1634 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1636 [...]I believe I've found another bug in 'sort'.
1641 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1644 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1648 Unix sort produced the answer I expected: sort on the single character
1649 in column 7. GNU sort produced different results, because it disagrees
1650 on the interpretation of the key-end spec "M.N". Unix sort reads this
1651 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1652 "skip M-1 fields, then either N-1 characters or the rest of the current
1653 field, whichever comes first". This extra clause applies only to
1654 key-ends, not key-starts.
1657 /* Make LIM point to the end of (one byte past) the current field. */
1658 if (tab
!= TAB_DEFAULT
)
1661 newlim
= memchr (ptr
, tab
, lim
- ptr
);
1669 while (newlim
< lim
&& blanks
[to_uchar (*newlim
)])
1671 while (newlim
< lim
&& !blanks
[to_uchar (*newlim
)])
1677 if (echar
!= 0) /* We need to skip over a portion of the end field. */
1679 /* If we're ignoring leading blanks when computing the End
1680 of the field, skip past them here. */
1681 if (key
->skipeblanks
)
1682 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1685 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1686 ptr
= MIN (lim
, ptr
+ echar
);
1692 /* Fill BUF reading from FP, moving buf->left bytes from the end
1693 of buf->buf to the beginning first. If EOF is reached and the
1694 file wasn't terminated by a newline, supply one. Set up BUF's line
1695 table too. FILE is the name of the file corresponding to FP.
1696 Return true if some input was read. */
1699 fillbuf (struct buffer
*buf
, FILE *fp
, char const *file
)
1701 struct keyfield
const *key
= keylist
;
1703 size_t line_bytes
= buf
->line_bytes
;
1704 size_t mergesize
= merge_buffer_size
- MIN_MERGE_BUFFER_SIZE
;
1709 if (buf
->used
!= buf
->left
)
1711 memmove (buf
->buf
, buf
->buf
+ buf
->used
- buf
->left
, buf
->left
);
1712 buf
->used
= buf
->left
;
1718 char *ptr
= buf
->buf
+ buf
->used
;
1719 struct line
*linelim
= buffer_linelim (buf
);
1720 struct line
*line
= linelim
- buf
->nlines
;
1721 size_t avail
= (char *) linelim
- buf
->nlines
* line_bytes
- ptr
;
1722 char *line_start
= buf
->nlines
? line
->text
+ line
->length
: buf
->buf
;
1724 while (line_bytes
+ 1 < avail
)
1726 /* Read as many bytes as possible, but do not read so many
1727 bytes that there might not be enough room for the
1728 corresponding line array. The worst case is when the
1729 rest of the input file consists entirely of newlines,
1730 except that the last byte is not a newline. */
1731 size_t readsize
= (avail
- 1) / (line_bytes
+ 1);
1732 size_t bytes_read
= fread (ptr
, 1, readsize
, fp
);
1733 char *ptrlim
= ptr
+ bytes_read
;
1735 avail
-= bytes_read
;
1737 if (bytes_read
!= readsize
)
1740 die (_("read failed"), file
);
1744 if (buf
->buf
== ptrlim
)
1746 if (line_start
!= ptrlim
&& ptrlim
[-1] != eol
)
1751 /* Find and record each line in the just-read input. */
1752 while ((p
= memchr (ptr
, eol
, ptrlim
- ptr
)))
1754 /* Delimit the line with NUL. This eliminates the need to
1755 temporarily replace the last byte with NUL when calling
1756 xmemcoll(), which increases performance. */
1760 line
->text
= line_start
;
1761 line
->length
= ptr
- line_start
;
1762 mergesize
= MAX (mergesize
, line
->length
);
1763 avail
-= line_bytes
;
1767 /* Precompute the position of the first key for
1769 line
->keylim
= (key
->eword
== SIZE_MAX
1771 : limfield (line
, key
));
1773 if (key
->sword
!= SIZE_MAX
)
1774 line
->keybeg
= begfield (line
, key
);
1777 if (key
->skipsblanks
)
1778 while (blanks
[to_uchar (*line_start
)])
1780 line
->keybeg
= line_start
;
1792 buf
->used
= ptr
- buf
->buf
;
1793 buf
->nlines
= buffer_linelim (buf
) - line
;
1794 if (buf
->nlines
!= 0)
1796 buf
->left
= ptr
- line_start
;
1797 merge_buffer_size
= mergesize
+ MIN_MERGE_BUFFER_SIZE
;
1802 /* The current input line is too long to fit in the buffer.
1803 Double the buffer size and try again, keeping it properly
1805 size_t line_alloc
= buf
->alloc
/ sizeof (struct line
);
1806 buf
->buf
= x2nrealloc (buf
->buf
, &line_alloc
, sizeof (struct line
));
1807 buf
->alloc
= line_alloc
* sizeof (struct line
);
1812 /* Table that maps characters to order-of-magnitude values. */
1813 static char const unit_order
[UCHAR_LIM
] =
1815 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1816 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1817 /* This initializer syntax works on all C99 hosts. For now, use
1818 it only on non-ASCII hosts, to ease the pain of porting to
1819 pre-C99 ASCII hosts. */
1820 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1823 /* Generate the following table with this command:
1824 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1825 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1827 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1828 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1829 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1830 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1831 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1832 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1833 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1834 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1835 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1836 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1837 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1841 /* Return an integer that represents the order of magnitude of the
1842 unit following the number. The number may contain thousands
1843 separators and a decimal point, but it may not contain leading blanks.
1844 Negative numbers get negative orders; zero numbers have a zero order. */
1846 static int _GL_ATTRIBUTE_PURE
1847 find_unit_order (char const *number
)
1849 bool minus_sign
= (*number
== '-');
1850 char const *p
= number
+ minus_sign
;
1854 /* Scan to end of number.
1855 Decimals or separators not followed by digits stop the scan.
1856 Numbers ending in decimals or separators are thus considered
1857 to be lacking in units.
1858 FIXME: add support for multibyte thousands_sep and decimal_point. */
1862 while (ISDIGIT (ch
= *p
++))
1863 nonzero
|= ch
- '0';
1865 while (ch
== thousands_sep
);
1867 if (ch
== decimal_point
)
1868 while (ISDIGIT (ch
= *p
++))
1869 nonzero
|= ch
- '0';
1873 int order
= unit_order
[ch
];
1874 return (minus_sign
? -order
: order
);
1880 /* Compare numbers A and B ending in units with SI or IEC prefixes
1881 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1884 human_numcompare (char const *a
, char const *b
)
1886 while (blanks
[to_uchar (*a
)])
1888 while (blanks
[to_uchar (*b
)])
1891 int diff
= find_unit_order (a
) - find_unit_order (b
);
1892 return (diff
? diff
: strnumcmp (a
, b
, decimal_point
, thousands_sep
));
1895 /* Compare strings A and B as numbers without explicitly converting them to
1896 machine numbers. Comparatively slow for short strings, but asymptotically
1900 numcompare (char const *a
, char const *b
)
1902 while (blanks
[to_uchar (*a
)])
1904 while (blanks
[to_uchar (*b
)])
1907 return strnumcmp (a
, b
, decimal_point
, thousands_sep
);
1910 /* Work around a problem whereby the long double value returned by glibc's
1911 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
1912 A and B before calling strtold. FIXME: remove this function once
1913 gnulib guarantees that strtold's result is always well defined. */
1915 nan_compare (char const *sa
, char const *sb
)
1918 memset (&a
, 0, sizeof a
);
1919 a
= strtold (sa
, NULL
);
1922 memset (&b
, 0, sizeof b
);
1923 b
= strtold (sb
, NULL
);
1925 return memcmp (&a
, &b
, sizeof a
);
1929 general_numcompare (char const *sa
, char const *sb
)
1931 /* FIXME: maybe add option to try expensive FP conversion
1932 only if A and B can't be compared more cheaply/accurately. */
1936 long_double a
= strtold (sa
, &ea
);
1937 long_double b
= strtold (sb
, &eb
);
1939 /* Put conversion errors at the start of the collating sequence. */
1941 return sb
== eb
? 0 : -1;
1945 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1946 conversion errors but before numbers; sort them by internal
1947 bit-pattern, for lack of a more portable alternative. */
1953 : nan_compare (sa
, sb
));
1956 /* Return an integer in 1..12 of the month name MONTH.
1957 Return 0 if the name in S is not recognized. */
1960 getmonth (char const *month
, char **ea
)
1963 size_t hi
= MONTHS_PER_YEAR
;
1965 while (blanks
[to_uchar (*month
)])
1970 size_t ix
= (lo
+ hi
) / 2;
1971 char const *m
= month
;
1972 char const *n
= monthtab
[ix
].name
;
1980 return monthtab
[ix
].val
;
1982 if (to_uchar (fold_toupper
[to_uchar (*m
)]) < to_uchar (*n
))
1987 else if (to_uchar (fold_toupper
[to_uchar (*m
)]) > to_uchar (*n
))
1999 /* A randomly chosen MD5 state, used for random comparison. */
2000 static struct md5_ctx random_md5_state
;
2002 /* Initialize the randomly chosen MD5 state. */
2005 random_md5_state_init (char const *random_source
)
2007 unsigned char buf
[MD5_DIGEST_SIZE
];
2008 struct randread_source
*r
= randread_new (random_source
, sizeof buf
);
2010 die (_("open failed"), random_source
);
2011 randread (r
, buf
, sizeof buf
);
2012 if (randread_free (r
) != 0)
2013 die (_("close failed"), random_source
);
2014 md5_init_ctx (&random_md5_state
);
2015 md5_process_bytes (buf
, sizeof buf
, &random_md5_state
);
2018 /* This is like strxfrm, except it reports any error and exits. */
2021 xstrxfrm (char *restrict dest
, char const *restrict src
, size_t destsize
)
2024 size_t translated_size
= strxfrm (dest
, src
, destsize
);
2028 error (0, errno
, _("string transformation failed"));
2029 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2030 error (SORT_FAILURE
, 0,
2031 _("the untransformed string was %s"),
2032 quotearg_n_style (0, locale_quoting_style
, src
));
2035 return translated_size
;
2038 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2039 using one or more random hash functions. TEXTA[LENA] and
2040 TEXTB[LENB] must be zero. */
2043 compare_random (char *restrict texta
, size_t lena
,
2044 char *restrict textb
, size_t lenb
)
2046 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2047 data. This is used to break ties if there is a checksum
2048 collision, and this is good enough given the astronomically low
2049 probability of a collision. */
2052 char stackbuf
[4000];
2053 char *buf
= stackbuf
;
2054 size_t bufsize
= sizeof stackbuf
;
2055 void *allocated
= NULL
;
2056 uint32_t dig
[2][MD5_DIGEST_SIZE
/ sizeof (uint32_t)];
2057 struct md5_ctx s
[2];
2058 s
[0] = s
[1] = random_md5_state
;
2060 if (hard_LC_COLLATE
)
2062 char const *lima
= texta
+ lena
;
2063 char const *limb
= textb
+ lenb
;
2067 /* Transform the text into the basis of comparison, so that byte
2068 strings that would otherwise considered to be equal are
2069 considered equal here even if their bytes differ.
2071 Each time through this loop, transform one
2072 null-terminated string's worth from TEXTA or from TEXTB
2073 or both. That way, there's no need to store the
2074 transformation of the whole line, if it contains many
2075 null-terminated strings. */
2077 /* Store the transformed data into a big-enough buffer. */
2079 /* A 3X size guess avoids the overhead of calling strxfrm
2080 twice on typical implementations. Don't worry about
2081 size_t overflow, as the guess need not be correct. */
2082 size_t guess_bufsize
= 3 * (lena
+ lenb
) + 2;
2083 if (bufsize
< guess_bufsize
)
2085 bufsize
= MAX (guess_bufsize
, bufsize
* 3 / 2);
2087 buf
= allocated
= malloc (bufsize
);
2091 bufsize
= sizeof stackbuf
;
2096 (texta
< lima
? xstrxfrm (buf
, texta
, bufsize
) + 1 : 0);
2097 bool a_fits
= sizea
<= bufsize
;
2100 ? (xstrxfrm ((a_fits
? buf
+ sizea
: NULL
), textb
,
2101 (a_fits
? bufsize
- sizea
: 0))
2105 if (! (a_fits
&& sizea
+ sizeb
<= bufsize
))
2107 bufsize
= sizea
+ sizeb
;
2108 if (bufsize
< SIZE_MAX
/ 3)
2109 bufsize
= bufsize
* 3 / 2;
2111 buf
= allocated
= xmalloc (bufsize
);
2113 strxfrm (buf
, texta
, sizea
);
2115 strxfrm (buf
+ sizea
, textb
, sizeb
);
2118 /* Advance past NULs to the next part of each input string,
2119 exiting the loop if both strings are exhausted. When
2120 exiting the loop, prepare to finish off the tiebreaker
2121 comparison properly. */
2123 texta
+= strlen (texta
) + 1;
2125 textb
+= strlen (textb
) + 1;
2126 if (! (texta
< lima
|| textb
< limb
))
2128 lena
= sizea
; texta
= buf
;
2129 lenb
= sizeb
; textb
= buf
+ sizea
;
2133 /* Accumulate the transformed data in the corresponding
2135 md5_process_bytes (buf
, sizea
, &s
[0]);
2136 md5_process_bytes (buf
+ sizea
, sizeb
, &s
[1]);
2138 /* Update the tiebreaker comparison of the transformed data. */
2141 xfrm_diff
= memcmp (buf
, buf
+ sizea
, MIN (sizea
, sizeb
));
2143 xfrm_diff
= (sizea
> sizeb
) - (sizea
< sizeb
);
2148 /* Compute and compare the checksums. */
2149 md5_process_bytes (texta
, lena
, &s
[0]); md5_finish_ctx (&s
[0], dig
[0]);
2150 md5_process_bytes (textb
, lenb
, &s
[1]); md5_finish_ctx (&s
[1], dig
[1]);
2151 int diff
= memcmp (dig
[0], dig
[1], sizeof dig
[0]);
2153 /* Fall back on the tiebreaker if the checksums collide. */
2158 xfrm_diff
= memcmp (texta
, textb
, MIN (lena
, lenb
));
2160 xfrm_diff
= (lena
> lenb
) - (lena
< lenb
);
2171 /* Return the printable width of the block of memory starting at
2172 TEXT and ending just before LIM, counting each tab as one byte.
2173 FIXME: Should we generally be counting non printable chars? */
2176 debug_width (char const *text
, char const *lim
)
2178 size_t width
= mbsnwidth (text
, lim
- text
, 0);
2180 width
+= (*text
++ == '\t');
2184 /* For debug mode, "underline" a key at the
2185 specified offset and screen width. */
2188 mark_key (size_t offset
, size_t width
)
2194 printf (_("^ no match for key\n"));
2205 /* Return true if KEY is a numeric key. */
2208 key_numeric (struct keyfield
const *key
)
2210 return key
->numeric
|| key
->general_numeric
|| key
->human_numeric
;
2213 /* For LINE, output a debugging line that underlines KEY in LINE.
2214 If KEY is null, underline the whole line. */
2217 debug_key (struct line
const *line
, struct keyfield
const *key
)
2219 char *text
= line
->text
;
2221 char *lim
= text
+ line
->length
- 1;
2225 if (key
->sword
!= SIZE_MAX
)
2226 beg
= begfield (line
, key
);
2227 if (key
->eword
!= SIZE_MAX
)
2228 lim
= limfield (line
, key
);
2230 if (key
->skipsblanks
|| key
->month
|| key_numeric (key
))
2235 while (blanks
[to_uchar (*beg
)])
2238 char *tighter_lim
= beg
;
2242 else if (key
->month
)
2243 getmonth (beg
, &tighter_lim
);
2244 else if (key
->general_numeric
)
2245 ignore_value (strtold (beg
, &tighter_lim
));
2246 else if (key
->numeric
|| key
->human_numeric
)
2248 char *p
= beg
+ (beg
< lim
&& *beg
== '-');
2249 bool found_digit
= false;
2254 while (ISDIGIT (ch
= *p
++))
2257 while (ch
== thousands_sep
);
2259 if (ch
== decimal_point
)
2260 while (ISDIGIT (ch
= *p
++))
2264 tighter_lim
= p
- ! (key
->human_numeric
&& unit_order
[ch
]);
2274 size_t offset
= debug_width (text
, beg
);
2275 size_t width
= debug_width (beg
, lim
);
2276 mark_key (offset
, width
);
2279 /* Debug LINE by underlining its keys. */
2282 debug_line (struct line
const *line
)
2284 struct keyfield
const *key
= keylist
;
2287 debug_key (line
, key
);
2288 while (key
&& ((key
= key
->next
) || ! (unique
|| stable
)));
2291 /* Return whether sorting options specified for key. */
2294 default_key_compare (struct keyfield
const *key
)
2296 return ! (key
->ignore
2300 || key_numeric (key
)
2304 /* || key->reverse */
2308 /* Convert a key to the short options used to specify it. */
2311 key_to_opts (struct keyfield
const *key
, char *opts
)
2313 if (key
->skipsblanks
|| key
->skipeblanks
)
2314 *opts
++ = 'b';/* either disables global -b */
2315 if (key
->ignore
== nondictionary
)
2319 if (key
->general_numeric
)
2321 if (key
->human_numeric
)
2323 if (key
->ignore
== nonprinting
)
2338 /* Output data independent key warnings to stderr. */
2341 key_warnings (struct keyfield
const *gkey
, bool gkey_only
)
2343 struct keyfield
const *key
;
2344 struct keyfield ugkey
= *gkey
;
2345 unsigned long keynum
= 1;
2347 for (key
= keylist
; key
; key
= key
->next
, keynum
++)
2349 if (key
->obsolete_used
)
2351 size_t sword
= key
->sword
;
2352 size_t eword
= key
->eword
;
2353 char tmp
[INT_BUFSIZE_BOUND (uintmax_t)];
2354 /* obsolescent syntax +A.x -B.y is equivalent to:
2355 -k A+1.x+1,B.y (when y = 0)
2356 -k A+1.x+1,B+1.y (when y > 0) */
2357 char obuf
[INT_BUFSIZE_BOUND (sword
) * 2 + 4]; /* +# -# */
2358 char nbuf
[INT_BUFSIZE_BOUND (sword
) * 2 + 5]; /* -k #,# */
2362 if (sword
== SIZE_MAX
)
2365 po
= stpcpy (stpcpy (po
, "+"), umaxtostr (sword
, tmp
));
2366 pn
= stpcpy (stpcpy (pn
, "-k "), umaxtostr (sword
+ 1, tmp
));
2367 if (key
->eword
!= SIZE_MAX
)
2369 stpcpy (stpcpy (po
, " -"), umaxtostr (eword
+ 1, tmp
));
2370 stpcpy (stpcpy (pn
, ","),
2371 umaxtostr (eword
+ 1
2372 + (key
->echar
== SIZE_MAX
), tmp
));
2374 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2375 quote_n (0, obuf
), quote_n (1, nbuf
));
2378 /* Warn about field specs that will never match. */
2379 if (key
->sword
!= SIZE_MAX
&& key
->eword
< key
->sword
)
2380 error (0, 0, _("key %lu has zero width and will be ignored"), keynum
);
2382 /* Warn about significant leading blanks. */
2383 bool implicit_skip
= key_numeric (key
) || key
->month
;
2384 bool maybe_space_aligned
= !hard_LC_COLLATE
&& default_key_compare (key
)
2385 && !(key
->schar
|| key
->echar
);
2386 bool line_offset
= key
->eword
== 0 && key
->echar
!= 0; /* -k1.x,1.y */
2387 if (!gkey_only
&& tab
== TAB_DEFAULT
&& !line_offset
2388 && ((!key
->skipsblanks
&& !(implicit_skip
|| maybe_space_aligned
))
2389 || (!key
->skipsblanks
&& key
->schar
)
2390 || (!key
->skipeblanks
&& key
->echar
)))
2391 error (0, 0, _("leading blanks are significant in key %lu; "
2392 "consider also specifying 'b'"), keynum
);
2394 /* Warn about numeric comparisons spanning fields,
2395 as field delimiters could be interpreted as part
2396 of the number (maybe only in other locales). */
2397 if (!gkey_only
&& key_numeric (key
))
2399 size_t sword
= key
->sword
+ 1;
2400 size_t eword
= key
->eword
+ 1;
2403 if (!eword
|| sword
< eword
)
2404 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2408 /* Flag global options not copied or specified in any key. */
2409 if (ugkey
.ignore
&& (ugkey
.ignore
== key
->ignore
))
2410 ugkey
.ignore
= NULL
;
2411 if (ugkey
.translate
&& (ugkey
.translate
== key
->translate
))
2412 ugkey
.translate
= NULL
;
2413 ugkey
.skipsblanks
&= !key
->skipsblanks
;
2414 ugkey
.skipeblanks
&= !key
->skipeblanks
;
2415 ugkey
.month
&= !key
->month
;
2416 ugkey
.numeric
&= !key
->numeric
;
2417 ugkey
.general_numeric
&= !key
->general_numeric
;
2418 ugkey
.human_numeric
&= !key
->human_numeric
;
2419 ugkey
.random
&= !key
->random
;
2420 ugkey
.version
&= !key
->version
;
2421 ugkey
.reverse
&= !key
->reverse
;
2424 /* Warn about ignored global options flagged above.
2425 Note if gkey is the only one in the list, all flags are cleared. */
2426 if (!default_key_compare (&ugkey
)
2427 || (ugkey
.reverse
&& (stable
|| unique
) && keylist
))
2429 bool ugkey_reverse
= ugkey
.reverse
;
2430 if (!(stable
|| unique
))
2431 ugkey
.reverse
= false;
2432 /* The following is too big, but guaranteed to be "big enough". */
2433 char opts
[sizeof short_options
];
2434 key_to_opts (&ugkey
, opts
);
2436 ngettext ("option '-%s' is ignored",
2437 "options '-%s' are ignored",
2438 select_plural (strlen (opts
))), opts
);
2439 ugkey
.reverse
= ugkey_reverse
;
2441 if (ugkey
.reverse
&& !(stable
|| unique
) && keylist
)
2442 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2445 /* Compare two lines A and B trying every key in sequence until there
2446 are no more keys or a difference is found. */
2449 keycompare (struct line
const *a
, struct line
const *b
)
2451 struct keyfield
*key
= keylist
;
2453 /* For the first iteration only, the key positions have been
2454 precomputed for us. */
2455 char *texta
= a
->keybeg
;
2456 char *textb
= b
->keybeg
;
2457 char *lima
= a
->keylim
;
2458 char *limb
= b
->keylim
;
2464 char const *translate
= key
->translate
;
2465 bool const *ignore
= key
->ignore
;
2467 /* Treat field ends before field starts as empty fields. */
2468 lima
= MAX (texta
, lima
);
2469 limb
= MAX (textb
, limb
);
2471 /* Find the lengths. */
2472 size_t lena
= lima
- texta
;
2473 size_t lenb
= limb
- textb
;
2475 if (hard_LC_COLLATE
|| key_numeric (key
)
2476 || key
->month
|| key
->random
|| key
->version
)
2483 char enda
IF_LINT (= 0);
2484 char endb
IF_LINT (= 0);
2485 void *allocated
IF_LINT (= NULL
);
2486 char stackbuf
[4000];
2488 if (ignore
|| translate
)
2490 /* Compute with copies of the keys, which are the result of
2491 translating or ignoring characters, and which need their
2496 /* Allocate space for copies. */
2497 size_t size
= lena
+ 1 + lenb
+ 1;
2498 if (size
<= sizeof stackbuf
)
2499 ta
= stackbuf
, allocated
= NULL
;
2501 ta
= allocated
= xmalloc (size
);
2504 /* Put into each copy a version of the key in which the
2505 requested characters are ignored or translated. */
2506 for (tlena
= i
= 0; i
< lena
; i
++)
2507 if (! (ignore
&& ignore
[to_uchar (texta
[i
])]))
2508 ta
[tlena
++] = (translate
2509 ? translate
[to_uchar (texta
[i
])]
2513 for (tlenb
= i
= 0; i
< lenb
; i
++)
2514 if (! (ignore
&& ignore
[to_uchar (textb
[i
])]))
2515 tb
[tlenb
++] = (translate
2516 ? translate
[to_uchar (textb
[i
])]
2522 /* Use the keys in-place, temporarily null-terminated. */
2523 ta
= texta
; tlena
= lena
; enda
= ta
[tlena
]; ta
[tlena
] = '\0';
2524 tb
= textb
; tlenb
= lenb
; endb
= tb
[tlenb
]; tb
[tlenb
] = '\0';
2528 diff
= numcompare (ta
, tb
);
2529 else if (key
->general_numeric
)
2530 diff
= general_numcompare (ta
, tb
);
2531 else if (key
->human_numeric
)
2532 diff
= human_numcompare (ta
, tb
);
2533 else if (key
->month
)
2534 diff
= getmonth (ta
, NULL
) - getmonth (tb
, NULL
);
2535 else if (key
->random
)
2536 diff
= compare_random (ta
, tlena
, tb
, tlenb
);
2537 else if (key
->version
)
2538 diff
= filevercmp (ta
, tb
);
2541 /* Locale-dependent string sorting. This is slower than
2542 C-locale sorting, which is implemented below. */
2544 diff
= - NONZERO (tlenb
);
2545 else if (tlenb
== 0)
2548 diff
= xmemcoll0 (ta
, tlena
+ 1, tb
, tlenb
+ 1);
2551 if (ignore
|| translate
)
2561 #define CMP_WITH_IGNORE(A, B) \
2566 while (texta < lima && ignore[to_uchar (*texta)]) \
2568 while (textb < limb && ignore[to_uchar (*textb)]) \
2570 if (! (texta < lima && textb < limb)) \
2572 diff = to_uchar (A) - to_uchar (B); \
2579 diff = (texta < lima) - (textb < limb); \
2584 CMP_WITH_IGNORE (translate
[to_uchar (*texta
)],
2585 translate
[to_uchar (*textb
)]);
2587 CMP_WITH_IGNORE (*texta
, *textb
);
2590 diff
= - NONZERO (lenb
);
2597 while (texta
< lima
&& textb
< limb
)
2599 diff
= (to_uchar (translate
[to_uchar (*texta
++)])
2600 - to_uchar (translate
[to_uchar (*textb
++)]));
2607 diff
= memcmp (texta
, textb
, MIN (lena
, lenb
));
2611 diff
= lena
< lenb
? -1 : lena
!= lenb
;
2621 /* Find the beginning and limit of the next field. */
2622 if (key
->eword
!= SIZE_MAX
)
2623 lima
= limfield (a
, key
), limb
= limfield (b
, key
);
2625 lima
= a
->text
+ a
->length
- 1, limb
= b
->text
+ b
->length
- 1;
2627 if (key
->sword
!= SIZE_MAX
)
2628 texta
= begfield (a
, key
), textb
= begfield (b
, key
);
2631 texta
= a
->text
, textb
= b
->text
;
2632 if (key
->skipsblanks
)
2634 while (texta
< lima
&& blanks
[to_uchar (*texta
)])
2636 while (textb
< limb
&& blanks
[to_uchar (*textb
)])
2647 return key
->reverse
? -diff
: diff
;
2650 /* Compare two lines A and B, returning negative, zero, or positive
2651 depending on whether A compares less than, equal to, or greater than B. */
2654 compare (struct line
const *a
, struct line
const *b
)
2659 /* First try to compare on the specified keys (if any).
2660 The only two cases with no key at all are unadorned sort,
2661 and unadorned sort -r. */
2664 diff
= keycompare (a
, b
);
2665 if (diff
|| unique
|| stable
)
2669 /* If the keys all compare equal (or no keys were specified)
2670 fall through to the default comparison. */
2671 alen
= a
->length
- 1, blen
= b
->length
- 1;
2674 diff
= - NONZERO (blen
);
2677 else if (hard_LC_COLLATE
)
2679 /* Note xmemcoll0 is a performance enhancement as
2680 it will not unconditionally write '\0' after the
2681 passed in buffers, which was seen to give around
2682 a 3% increase in performance for short lines. */
2683 diff
= xmemcoll0 (a
->text
, alen
+ 1, b
->text
, blen
+ 1);
2685 else if (! (diff
= memcmp (a
->text
, b
->text
, MIN (alen
, blen
))))
2686 diff
= alen
< blen
? -1 : alen
!= blen
;
2688 return reverse
? -diff
: diff
;
2691 /* Write LINE to output stream FP; the output file's name is
2692 OUTPUT_FILE if OUTPUT_FILE is nonnull, and is the standard output
2693 otherwise. If debugging is enabled and FP is standard output,
2694 append some debugging information. */
2697 write_line (struct line
const *line
, FILE *fp
, char const *output_file
)
2699 char *buf
= line
->text
;
2700 size_t n_bytes
= line
->length
;
2701 char *ebuf
= buf
+ n_bytes
;
2703 if (!output_file
&& debug
)
2705 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2706 char const *c
= buf
;
2715 if (fputc (wc
, fp
) == EOF
)
2716 die (_("write failed"), output_file
);
2724 if (fwrite (buf
, 1, n_bytes
, fp
) != n_bytes
)
2725 die (_("write failed"), output_file
);
2730 /* Check that the lines read from FILE_NAME come in order. Return
2731 true if they are in order. If CHECKONLY == 'c', also print a
2732 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2733 they are not in order. */
2736 check (char const *file_name
, char checkonly
)
2738 FILE *fp
= xfopen (file_name
, "r");
2739 struct buffer buf
; /* Input buffer. */
2740 struct line temp
; /* Copy of previous line. */
2742 uintmax_t line_number
= 0;
2743 struct keyfield
const *key
= keylist
;
2744 bool nonunique
= ! unique
;
2745 bool ordered
= true;
2747 initbuf (&buf
, sizeof (struct line
),
2748 MAX (merge_buffer_size
, sort_size
));
2751 while (fillbuf (&buf
, fp
, file_name
))
2753 struct line
const *line
= buffer_linelim (&buf
);
2754 struct line
const *linebase
= line
- buf
.nlines
;
2756 /* Make sure the line saved from the old buffer contents is
2757 less than or equal to the first line of the new buffer. */
2758 if (alloc
&& nonunique
<= compare (&temp
, line
- 1))
2762 if (checkonly
== 'c')
2764 struct line
const *disorder_line
= line
- 1;
2765 uintmax_t disorder_line_number
=
2766 buffer_linelim (&buf
) - disorder_line
+ line_number
;
2767 char hr_buf
[INT_BUFSIZE_BOUND (disorder_line_number
)];
2768 fprintf (stderr
, _("%s: %s:%s: disorder: "),
2769 program_name
, file_name
,
2770 umaxtostr (disorder_line_number
, hr_buf
));
2771 write_line (disorder_line
, stderr
, _("standard error"));
2779 /* Compare each line in the buffer with its successor. */
2780 while (linebase
< --line
)
2781 if (nonunique
<= compare (line
, line
- 1))
2782 goto found_disorder
;
2784 line_number
+= buf
.nlines
;
2786 /* Save the last line of the buffer. */
2787 if (alloc
< line
->length
)
2794 alloc
= line
->length
;
2798 while (alloc
< line
->length
);
2801 temp
.text
= xmalloc (alloc
);
2803 memcpy (temp
.text
, line
->text
, line
->length
);
2804 temp
.length
= line
->length
;
2807 temp
.keybeg
= temp
.text
+ (line
->keybeg
- line
->text
);
2808 temp
.keylim
= temp
.text
+ (line
->keylim
- line
->text
);
2812 xfclose (fp
, file_name
);
2818 /* Open FILES (there are NFILES of them) and store the resulting array
2819 of stream pointers into (*PFPS). Allocate the array. Return the
2820 number of successfully opened files, setting errno if this value is
2821 less than NFILES. */
2824 open_input_files (struct sortfile
*files
, size_t nfiles
, FILE ***pfps
)
2826 FILE **fps
= *pfps
= xnmalloc (nfiles
, sizeof *fps
);
2829 /* Open as many input files as we can. */
2830 for (i
= 0; i
< nfiles
; i
++)
2832 fps
[i
] = (files
[i
].temp
&& files
[i
].temp
->state
!= UNCOMPRESSED
2833 ? open_temp (files
[i
].temp
)
2834 : stream_open (files
[i
].name
, "r"));
2842 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2843 files (all of which are at the start of the FILES array), and
2844 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2845 FPS is the vector of open stream corresponding to the files.
2846 Close input and output streams before returning.
2847 OUTPUT_FILE gives the name of the output file. If it is NULL,
2848 the output file is standard output. */
2851 mergefps (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
2852 FILE *ofp
, char const *output_file
, FILE **fps
)
2854 struct buffer
*buffer
= xnmalloc (nfiles
, sizeof *buffer
);
2855 /* Input buffers for each file. */
2856 struct line saved
; /* Saved line storage for unique check. */
2857 struct line
const *savedline
= NULL
;
2858 /* &saved if there is a saved line. */
2859 size_t savealloc
= 0; /* Size allocated for the saved line. */
2860 struct line
const **cur
= xnmalloc (nfiles
, sizeof *cur
);
2861 /* Current line in each line table. */
2862 struct line
const **base
= xnmalloc (nfiles
, sizeof *base
);
2863 /* Base of each line table. */
2864 size_t *ord
= xnmalloc (nfiles
, sizeof *ord
);
2865 /* Table representing a permutation of fps,
2866 such that cur[ord[0]] is the smallest line
2867 and will be next output. */
2871 struct keyfield
const *key
= keylist
;
2874 /* Read initial lines from each input file. */
2875 for (i
= 0; i
< nfiles
; )
2877 initbuf (&buffer
[i
], sizeof (struct line
),
2878 MAX (merge_buffer_size
, sort_size
/ nfiles
));
2879 if (fillbuf (&buffer
[i
], fps
[i
], files
[i
].name
))
2881 struct line
const *linelim
= buffer_linelim (&buffer
[i
]);
2882 cur
[i
] = linelim
- 1;
2883 base
[i
] = linelim
- buffer
[i
].nlines
;
2888 /* fps[i] is empty; eliminate it from future consideration. */
2889 xfclose (fps
[i
], files
[i
].name
);
2893 zaptemp (files
[i
].name
);
2895 free (buffer
[i
].buf
);
2897 for (j
= i
; j
< nfiles
; ++j
)
2899 files
[j
] = files
[j
+ 1];
2900 fps
[j
] = fps
[j
+ 1];
2905 /* Set up the ord table according to comparisons among input lines.
2906 Since this only reorders two items if one is strictly greater than
2907 the other, it is stable. */
2908 for (i
= 0; i
< nfiles
; ++i
)
2910 for (i
= 1; i
< nfiles
; ++i
)
2911 if (0 < compare (cur
[ord
[i
- 1]], cur
[ord
[i
]]))
2912 t
= ord
[i
- 1], ord
[i
- 1] = ord
[i
], ord
[i
] = t
, i
= 0;
2914 /* Repeatedly output the smallest line until no input remains. */
2917 struct line
const *smallest
= cur
[ord
[0]];
2919 /* If uniquified output is turned on, output only the first of
2920 an identical series of lines. */
2923 if (savedline
&& compare (savedline
, smallest
))
2926 write_line (&saved
, ofp
, output_file
);
2931 if (savealloc
< smallest
->length
)
2936 savealloc
= smallest
->length
;
2939 while ((savealloc
*= 2) < smallest
->length
);
2942 saved
.text
= xmalloc (savealloc
);
2944 saved
.length
= smallest
->length
;
2945 memcpy (saved
.text
, smallest
->text
, saved
.length
);
2949 saved
.text
+ (smallest
->keybeg
- smallest
->text
);
2951 saved
.text
+ (smallest
->keylim
- smallest
->text
);
2956 write_line (smallest
, ofp
, output_file
);
2958 /* Check if we need to read more lines into core. */
2959 if (base
[ord
[0]] < smallest
)
2960 cur
[ord
[0]] = smallest
- 1;
2963 if (fillbuf (&buffer
[ord
[0]], fps
[ord
[0]], files
[ord
[0]].name
))
2965 struct line
const *linelim
= buffer_linelim (&buffer
[ord
[0]]);
2966 cur
[ord
[0]] = linelim
- 1;
2967 base
[ord
[0]] = linelim
- buffer
[ord
[0]].nlines
;
2971 /* We reached EOF on fps[ord[0]]. */
2972 for (i
= 1; i
< nfiles
; ++i
)
2973 if (ord
[i
] > ord
[0])
2976 xfclose (fps
[ord
[0]], files
[ord
[0]].name
);
2977 if (ord
[0] < ntemps
)
2980 zaptemp (files
[ord
[0]].name
);
2982 free (buffer
[ord
[0]].buf
);
2983 for (i
= ord
[0]; i
< nfiles
; ++i
)
2985 fps
[i
] = fps
[i
+ 1];
2986 files
[i
] = files
[i
+ 1];
2987 buffer
[i
] = buffer
[i
+ 1];
2988 cur
[i
] = cur
[i
+ 1];
2989 base
[i
] = base
[i
+ 1];
2991 for (i
= 0; i
< nfiles
; ++i
)
2992 ord
[i
] = ord
[i
+ 1];
2997 /* The new line just read in may be larger than other lines
2998 already in main memory; push it back in the queue until we
2999 encounter a line larger than it. Optimize for the common
3000 case where the new line is smallest. */
3005 size_t ord0
= ord
[0];
3006 size_t count_of_smaller_lines
;
3010 int cmp
= compare (cur
[ord0
], cur
[ord
[probe
]]);
3011 if (cmp
< 0 || (cmp
== 0 && ord0
< ord
[probe
]))
3015 probe
= (lo
+ hi
) / 2;
3018 count_of_smaller_lines
= lo
- 1;
3019 for (j
= 0; j
< count_of_smaller_lines
; j
++)
3020 ord
[j
] = ord
[j
+ 1];
3021 ord
[count_of_smaller_lines
] = ord0
;
3025 if (unique
&& savedline
)
3027 write_line (&saved
, ofp
, output_file
);
3031 xfclose (ofp
, output_file
);
3039 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3040 files (all of which are at the start of the FILES array), and
3041 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3042 Close input and output files before returning.
3043 OUTPUT_FILE gives the name of the output file.
3045 Return the number of files successfully merged. This number can be
3046 less than NFILES if we ran low on file descriptors, but in this
3047 case it is never less than 2. */
3050 mergefiles (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
3051 FILE *ofp
, char const *output_file
)
3054 size_t nopened
= open_input_files (files
, nfiles
, &fps
);
3055 if (nopened
< nfiles
&& nopened
< 2)
3056 die (_("open failed"), files
[nopened
].name
);
3057 mergefps (files
, ntemps
, nopened
, ofp
, output_file
, fps
);
3061 /* Merge into T (of size NLINES) the two sorted arrays of lines
3062 LO (with NLINES / 2 members), and
3063 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3064 T and LO point just past their respective arrays, and the arrays
3065 are in reverse order. NLINES must be at least 2. */
3068 mergelines (struct line
*restrict t
, size_t nlines
,
3069 struct line
const *restrict lo
)
3071 size_t nlo
= nlines
/ 2;
3072 size_t nhi
= nlines
- nlo
;
3073 struct line
*hi
= t
- nlo
;
3076 if (compare (lo
- 1, hi
- 1) <= 0)
3081 /* HI must equal T now, and there is no need to copy from
3100 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3101 Do this all within one thread. NLINES must be at least 2.
3102 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3103 Otherwise the sort is in-place and TEMP is half-sized.
3104 The input and output arrays are in reverse order, and LINES and
3105 TEMP point just past the end of their respective arrays.
3107 Use a recursive divide-and-conquer algorithm, in the style
3108 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3109 the optimization suggested by exercise 5.2.4-10; this requires room
3110 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3111 writes that this memory optimization was originally published by
3112 D. A. Bell, Comp J. 1 (1958), 75. */
3115 sequential_sort (struct line
*restrict lines
, size_t nlines
,
3116 struct line
*restrict temp
, bool to_temp
)
3120 /* Declare 'swap' as int, not bool, to work around a bug
3121 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3122 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3123 int swap
= (0 < compare (&lines
[-1], &lines
[-2]));
3126 temp
[-1] = lines
[-1 - swap
];
3127 temp
[-2] = lines
[-2 + swap
];
3131 temp
[-1] = lines
[-1];
3132 lines
[-1] = lines
[-2];
3133 lines
[-2] = temp
[-1];
3138 size_t nlo
= nlines
/ 2;
3139 size_t nhi
= nlines
- nlo
;
3140 struct line
*lo
= lines
;
3141 struct line
*hi
= lines
- nlo
;
3143 sequential_sort (hi
, nhi
, temp
- (to_temp
? nlo
: 0), to_temp
);
3145 sequential_sort (lo
, nlo
, temp
, !to_temp
);
3150 struct line
const *sorted_lo
;
3161 mergelines (dest
, nlines
, sorted_lo
);
3165 static struct merge_node
*init_node (struct merge_node
*restrict
,
3166 struct merge_node
*restrict
,
3167 struct line
*, size_t, size_t, bool);
3170 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3171 lines, with destination DEST. */
3172 static struct merge_node
*
3173 merge_tree_init (size_t nthreads
, size_t nlines
, struct line
*dest
)
3175 struct merge_node
*merge_tree
= xmalloc (2 * sizeof *merge_tree
* nthreads
);
3177 struct merge_node
*root
= merge_tree
;
3178 root
->lo
= root
->hi
= root
->end_lo
= root
->end_hi
= NULL
;
3180 root
->nlo
= root
->nhi
= nlines
;
3181 root
->parent
= NULL
;
3182 root
->level
= MERGE_END
;
3183 root
->queued
= false;
3184 pthread_mutex_init (&root
->lock
, NULL
);
3186 init_node (root
, root
+ 1, dest
, nthreads
, nlines
, false);
3190 /* Destroy the merge tree. */
3192 merge_tree_destroy (struct merge_node
*merge_tree
)
3197 /* Initialize a merge tree node and its descendants. The node's
3198 parent is PARENT. The node and its descendants are taken from the
3199 array of nodes NODE_POOL. Their destination starts at DEST; they
3200 will consume NTHREADS threads. The total number of sort lines is
3201 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3204 static struct merge_node
*
3205 init_node (struct merge_node
*restrict parent
,
3206 struct merge_node
*restrict node_pool
,
3207 struct line
*dest
, size_t nthreads
,
3208 size_t total_lines
, bool is_lo_child
)
3210 size_t nlines
= (is_lo_child
? parent
->nlo
: parent
->nhi
);
3211 size_t nlo
= nlines
/ 2;
3212 size_t nhi
= nlines
- nlo
;
3213 struct line
*lo
= dest
- total_lines
;
3214 struct line
*hi
= lo
- nlo
;
3215 struct line
**parent_end
= (is_lo_child
? &parent
->end_lo
: &parent
->end_hi
);
3217 struct merge_node
*node
= node_pool
++;
3218 node
->lo
= node
->end_lo
= lo
;
3219 node
->hi
= node
->end_hi
= hi
;
3220 node
->dest
= parent_end
;
3223 node
->parent
= parent
;
3224 node
->level
= parent
->level
+ 1;
3225 node
->queued
= false;
3226 pthread_mutex_init (&node
->lock
, NULL
);
3230 size_t lo_threads
= nthreads
/ 2;
3231 size_t hi_threads
= nthreads
- lo_threads
;
3232 node
->lo_child
= node_pool
;
3233 node_pool
= init_node (node
, node_pool
, lo
, lo_threads
,
3235 node
->hi_child
= node_pool
;
3236 node_pool
= init_node (node
, node_pool
, hi
, hi_threads
,
3237 total_lines
, false);
3241 node
->lo_child
= NULL
;
3242 node
->hi_child
= NULL
;
3248 /* Compare two merge nodes A and B for priority. */
3251 compare_nodes (void const *a
, void const *b
)
3253 struct merge_node
const *nodea
= a
;
3254 struct merge_node
const *nodeb
= b
;
3255 if (nodea
->level
== nodeb
->level
)
3256 return (nodea
->nlo
+ nodea
->nhi
) < (nodeb
->nlo
+ nodeb
->nhi
);
3257 return nodea
->level
< nodeb
->level
;
3260 /* Lock a merge tree NODE. */
3263 lock_node (struct merge_node
*node
)
3265 pthread_mutex_lock (&node
->lock
);
3268 /* Unlock a merge tree NODE. */
3271 unlock_node (struct merge_node
*node
)
3273 pthread_mutex_unlock (&node
->lock
);
3276 /* Destroy merge QUEUE. */
3279 queue_destroy (struct merge_node_queue
*queue
)
3281 heap_free (queue
->priority_queue
);
3282 pthread_cond_destroy (&queue
->cond
);
3283 pthread_mutex_destroy (&queue
->mutex
);
3286 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3287 NTHREADS threads. */
3290 queue_init (struct merge_node_queue
*queue
, size_t nthreads
)
3292 /* Though it's highly unlikely all nodes are in the heap at the same
3293 time, the heap should accommodate all of them. Counting a NULL
3294 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3295 queue
->priority_queue
= heap_alloc (compare_nodes
, 2 * nthreads
);
3296 pthread_mutex_init (&queue
->mutex
, NULL
);
3297 pthread_cond_init (&queue
->cond
, NULL
);
3300 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3301 does not need to lock NODE. */
3304 queue_insert (struct merge_node_queue
*queue
, struct merge_node
*node
)
3306 pthread_mutex_lock (&queue
->mutex
);
3307 heap_insert (queue
->priority_queue
, node
);
3308 node
->queued
= true;
3309 pthread_mutex_unlock (&queue
->mutex
);
3310 pthread_cond_signal (&queue
->cond
);
3313 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3315 static struct merge_node
*
3316 queue_pop (struct merge_node_queue
*queue
)
3318 struct merge_node
*node
;
3319 pthread_mutex_lock (&queue
->mutex
);
3320 while (! (node
= heap_remove_top (queue
->priority_queue
)))
3321 pthread_cond_wait (&queue
->cond
, &queue
->mutex
);
3322 pthread_mutex_unlock (&queue
->mutex
);
3324 node
->queued
= false;
3328 /* Output LINE to TFP, unless -u is specified and the line compares
3329 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3330 is null if TFP is standard output.
3332 This function does not save the line for comparison later, so it is
3333 appropriate only for internal sort. */
3336 write_unique (struct line
const *line
, FILE *tfp
, char const *temp_output
)
3338 static struct line saved
;
3342 if (saved
.text
&& ! compare (line
, &saved
))
3347 write_line (line
, tfp
, temp_output
);
3350 /* Merge the lines currently available to a NODE in the binary
3351 merge tree. Merge a number of lines appropriate for this merge
3352 level, assuming TOTAL_LINES is the total number of lines.
3354 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3355 the name of TFP, or is null if TFP is standard output. */
3358 mergelines_node (struct merge_node
*restrict node
, size_t total_lines
,
3359 FILE *tfp
, char const *temp_output
)
3361 struct line
*lo_orig
= node
->lo
;
3362 struct line
*hi_orig
= node
->hi
;
3363 size_t to_merge
= MAX_MERGE (total_lines
, node
->level
);
3367 if (node
->level
> MERGE_ROOT
)
3369 /* Merge to destination buffer. */
3370 struct line
*dest
= *node
->dest
;
3371 while (node
->lo
!= node
->end_lo
&& node
->hi
!= node
->end_hi
&& to_merge
--)
3372 if (compare (node
->lo
- 1, node
->hi
- 1) <= 0)
3373 *--dest
= *--node
->lo
;
3375 *--dest
= *--node
->hi
;
3377 merged_lo
= lo_orig
- node
->lo
;
3378 merged_hi
= hi_orig
- node
->hi
;
3380 if (node
->nhi
== merged_hi
)
3381 while (node
->lo
!= node
->end_lo
&& to_merge
--)
3382 *--dest
= *--node
->lo
;
3383 else if (node
->nlo
== merged_lo
)
3384 while (node
->hi
!= node
->end_hi
&& to_merge
--)
3385 *--dest
= *--node
->hi
;
3390 /* Merge directly to output. */
3391 while (node
->lo
!= node
->end_lo
&& node
->hi
!= node
->end_hi
&& to_merge
--)
3393 if (compare (node
->lo
- 1, node
->hi
- 1) <= 0)
3394 write_unique (--node
->lo
, tfp
, temp_output
);
3396 write_unique (--node
->hi
, tfp
, temp_output
);
3399 merged_lo
= lo_orig
- node
->lo
;
3400 merged_hi
= hi_orig
- node
->hi
;
3402 if (node
->nhi
== merged_hi
)
3404 while (node
->lo
!= node
->end_lo
&& to_merge
--)
3405 write_unique (--node
->lo
, tfp
, temp_output
);
3407 else if (node
->nlo
== merged_lo
)
3409 while (node
->hi
!= node
->end_hi
&& to_merge
--)
3410 write_unique (--node
->hi
, tfp
, temp_output
);
3415 merged_lo
= lo_orig
- node
->lo
;
3416 merged_hi
= hi_orig
- node
->hi
;
3417 node
->nlo
-= merged_lo
;
3418 node
->nhi
-= merged_hi
;
3421 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3422 NODE's children has available lines and the other either has
3423 available lines or has exhausted its lines. */
3426 queue_check_insert (struct merge_node_queue
*queue
, struct merge_node
*node
)
3430 bool lo_avail
= (node
->lo
- node
->end_lo
) != 0;
3431 bool hi_avail
= (node
->hi
- node
->end_hi
) != 0;
3432 if (lo_avail
? hi_avail
|| ! node
->nhi
: hi_avail
&& ! node
->nlo
)
3433 queue_insert (queue
, node
);
3437 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3440 queue_check_insert_parent (struct merge_node_queue
*queue
,
3441 struct merge_node
*node
)
3443 if (node
->level
> MERGE_ROOT
)
3445 lock_node (node
->parent
);
3446 queue_check_insert (queue
, node
->parent
);
3447 unlock_node (node
->parent
);
3449 else if (node
->nlo
+ node
->nhi
== 0)
3451 /* If the MERGE_ROOT NODE has finished merging, insert the
3453 queue_insert (queue
, node
->parent
);
3457 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3458 some of those lines, until the MERGE_END node is popped.
3459 TOTAL_LINES is the total number of lines. If merging at the top
3460 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3461 null if TFP is standard output. */
3464 merge_loop (struct merge_node_queue
*queue
,
3465 size_t total_lines
, FILE *tfp
, char const *temp_output
)
3469 struct merge_node
*node
= queue_pop (queue
);
3471 if (node
->level
== MERGE_END
)
3474 /* Reinsert so other threads can pop it. */
3475 queue_insert (queue
, node
);
3478 mergelines_node (node
, total_lines
, tfp
, temp_output
);
3479 queue_check_insert (queue
, node
);
3480 queue_check_insert_parent (queue
, node
);
3487 static void sortlines (struct line
*restrict
, size_t, size_t,
3488 struct merge_node
*, struct merge_node_queue
*,
3489 FILE *, char const *);
3491 /* Thread arguments for sortlines_thread. */
3495 /* Source, i.e., the array of lines to sort. This points just past
3496 the end of the array. */
3499 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3502 /* Number of lines in LINES and DEST. */
3503 size_t const total_lines
;
3505 /* Merge node. Lines from this node and this node's sibling will merged
3506 to this node's parent. */
3507 struct merge_node
*const node
;
3509 /* The priority queue controlling available work for the entire
3511 struct merge_node_queue
*const queue
;
3513 /* If at the top level, the file to output to, and the file's name.
3514 If the file is standard output, the file's name is null. */
3516 char const *output_temp
;
3519 /* Like sortlines, except with a signature acceptable to pthread_create. */
3522 sortlines_thread (void *data
)
3524 struct thread_args
const *args
= data
;
3525 sortlines (args
->lines
, args
->nthreads
, args
->total_lines
,
3526 args
->node
, args
->queue
, args
->tfp
,
3531 /* Sort lines, possibly in parallel. The arguments are as in struct
3534 The algorithm has three phases: node creation, sequential sort,
3537 During node creation, sortlines recursively visits each node in the
3538 binary merge tree and creates a NODE structure corresponding to all the
3539 future line merging NODE is responsible for. For each call to
3540 sortlines, half the available threads are assigned to each recursive
3541 call, until a leaf node having only 1 available thread is reached.
3543 Each leaf node then performs two sequential sorts, one on each half of
3544 the lines it is responsible for. It records in its NODE structure that
3545 there are two sorted sublists available to merge from, and inserts its
3546 NODE into the priority queue.
3548 The binary merge phase then begins. Each thread drops into a loop
3549 where the thread retrieves a NODE from the priority queue, merges lines
3550 available to that NODE, and potentially insert NODE or its parent back
3551 into the queue if there are sufficient available lines for them to
3552 merge. This continues until all lines at all nodes of the merge tree
3553 have been merged. */
3556 sortlines (struct line
*restrict lines
, size_t nthreads
,
3557 size_t total_lines
, struct merge_node
*node
,
3558 struct merge_node_queue
*queue
, FILE *tfp
, char const *temp_output
)
3560 size_t nlines
= node
->nlo
+ node
->nhi
;
3562 /* Calculate thread arguments. */
3563 size_t lo_threads
= nthreads
/ 2;
3564 size_t hi_threads
= nthreads
- lo_threads
;
3566 struct thread_args args
= {lines
, lo_threads
, total_lines
,
3567 node
->lo_child
, queue
, tfp
, temp_output
};
3569 if (nthreads
> 1 && SUBTHREAD_LINES_HEURISTIC
<= nlines
3570 && pthread_create (&thread
, NULL
, sortlines_thread
, &args
) == 0)
3572 sortlines (lines
- node
->nlo
, hi_threads
, total_lines
,
3573 node
->hi_child
, queue
, tfp
, temp_output
);
3574 pthread_join (thread
, NULL
);
3578 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3579 Sort with 1 thread. */
3580 size_t nlo
= node
->nlo
;
3581 size_t nhi
= node
->nhi
;
3582 struct line
*temp
= lines
- total_lines
;
3584 sequential_sort (lines
- nlo
, nhi
, temp
- nlo
/ 2, false);
3586 sequential_sort (lines
, nlo
, temp
, false);
3588 /* Update merge NODE. No need to lock yet. */
3590 node
->hi
= lines
- nlo
;
3591 node
->end_lo
= lines
- nlo
;
3592 node
->end_hi
= lines
- nlo
- nhi
;
3594 queue_insert (queue
, node
);
3595 merge_loop (queue
, total_lines
, tfp
, temp_output
);
3598 pthread_mutex_destroy (&node
->lock
);
3601 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3602 the same as OUTFILE. If found, replace each with the same
3603 temporary copy that can be merged into OUTFILE without destroying
3604 OUTFILE before it is completely read. This temporary copy does not
3605 count as a merge temp, so don't worry about incrementing NTEMPS in
3606 the caller; final cleanup will remove it, not zaptemp.
3608 This test ensures that an otherwise-erroneous use like
3609 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3610 It's not clear that POSIX requires this nicety.
3611 Detect common error cases, but don't try to catch obscure cases like
3612 "cat ... FILE ... | sort -m -o FILE"
3613 where traditional "sort" doesn't copy the input and where
3614 people should know that they're getting into trouble anyway.
3615 Catching these obscure cases would slow down performance in
3619 avoid_trashing_input (struct sortfile
*files
, size_t ntemps
,
3620 size_t nfiles
, char const *outfile
)
3623 bool got_outstat
= false;
3624 struct stat outstat
;
3625 struct tempnode
*tempcopy
= NULL
;
3627 for (i
= ntemps
; i
< nfiles
; i
++)
3629 bool is_stdin
= STREQ (files
[i
].name
, "-");
3633 if (outfile
&& STREQ (outfile
, files
[i
].name
) && !is_stdin
)
3640 ? stat (outfile
, &outstat
)
3641 : fstat (STDOUT_FILENO
, &outstat
))
3648 ? fstat (STDIN_FILENO
, &instat
)
3649 : stat (files
[i
].name
, &instat
))
3651 && SAME_INODE (instat
, outstat
));
3659 tempcopy
= create_temp (&tftp
);
3660 mergefiles (&files
[i
], 0, 1, tftp
, tempcopy
->name
);
3663 files
[i
].name
= tempcopy
->name
;
3664 files
[i
].temp
= tempcopy
;
3669 /* Merge the input FILES. NTEMPS is the number of files at the
3670 start of FILES that are temporary; it is zero at the top level.
3671 NFILES is the total number of files. Put the output in
3672 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3675 merge (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
3676 char const *output_file
)
3678 while (nmerge
< nfiles
)
3680 /* Number of input files processed so far. */
3683 /* Number of output files generated so far. */
3686 /* nfiles % NMERGE; this counts input files that are left over
3687 after all full-sized merges have been done. */
3690 /* Number of easily-available slots at the next loop iteration. */
3693 /* Do as many NMERGE-size merges as possible. In the case that
3694 nmerge is bogus, increment by the maximum number of file
3695 descriptors allowed. */
3696 for (out
= in
= 0; nmerge
<= nfiles
- in
; out
++)
3699 struct tempnode
*temp
= create_temp (&tfp
);
3700 size_t num_merged
= mergefiles (&files
[in
], MIN (ntemps
, nmerge
),
3701 nmerge
, tfp
, temp
->name
);
3702 ntemps
-= MIN (ntemps
, num_merged
);
3703 files
[out
].name
= temp
->name
;
3704 files
[out
].temp
= temp
;
3708 remainder
= nfiles
- in
;
3709 cheap_slots
= nmerge
- out
% nmerge
;
3711 if (cheap_slots
< remainder
)
3713 /* So many files remain that they can't all be put into the last
3714 NMERGE-sized output window. Do one more merge. Merge as few
3715 files as possible, to avoid needless I/O. */
3716 size_t nshortmerge
= remainder
- cheap_slots
+ 1;
3718 struct tempnode
*temp
= create_temp (&tfp
);
3719 size_t num_merged
= mergefiles (&files
[in
], MIN (ntemps
, nshortmerge
),
3720 nshortmerge
, tfp
, temp
->name
);
3721 ntemps
-= MIN (ntemps
, num_merged
);
3722 files
[out
].name
= temp
->name
;
3723 files
[out
++].temp
= temp
;
3727 /* Put the remaining input files into the last NMERGE-sized output
3728 window, so they will be merged in the next pass. */
3729 memmove (&files
[out
], &files
[in
], (nfiles
- in
) * sizeof *files
);
3734 avoid_trashing_input (files
, ntemps
, nfiles
, output_file
);
3736 /* We aren't guaranteed that this final mergefiles will work, therefore we
3737 try to merge into the output, and then merge as much as we can into a
3738 temp file if we can't. Repeat. */
3742 /* Merge directly into the output file if possible. */
3744 size_t nopened
= open_input_files (files
, nfiles
, &fps
);
3746 if (nopened
== nfiles
)
3748 FILE *ofp
= stream_open (output_file
, "w");
3751 mergefps (files
, ntemps
, nfiles
, ofp
, output_file
, fps
);
3754 if (errno
!= EMFILE
|| nopened
<= 2)
3755 die (_("open failed"), output_file
);
3757 else if (nopened
<= 2)
3758 die (_("open failed"), files
[nopened
].name
);
3760 /* We ran out of file descriptors. Close one of the input
3761 files, to gain a file descriptor. Then create a temporary
3762 file with our spare file descriptor. Retry if that failed
3763 (e.g., some other process could open a file between the time
3764 we closed and tried to create). */
3766 struct tempnode
*temp
;
3770 xfclose (fps
[nopened
], files
[nopened
].name
);
3771 temp
= maybe_create_temp (&tfp
, ! (nopened
<= 2));
3775 /* Merge into the newly allocated temporary. */
3776 mergefps (&files
[0], MIN (ntemps
, nopened
), nopened
, tfp
, temp
->name
,
3778 ntemps
-= MIN (ntemps
, nopened
);
3779 files
[0].name
= temp
->name
;
3780 files
[0].temp
= temp
;
3782 memmove (&files
[1], &files
[nopened
], (nfiles
- nopened
) * sizeof *files
);
3784 nfiles
-= nopened
- 1;
3788 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3791 sort (char *const *files
, size_t nfiles
, char const *output_file
,
3795 IF_LINT (buf
.buf
= NULL
);
3797 bool output_file_created
= false;
3803 char const *temp_output
;
3804 char const *file
= *files
;
3805 FILE *fp
= xfopen (file
, "r");
3808 size_t bytes_per_line
;
3814 while (tmp
< nthreads
)
3819 bytes_per_line
= (mult
* sizeof (struct line
));
3822 bytes_per_line
= sizeof (struct line
) * 3 / 2;
3825 initbuf (&buf
, bytes_per_line
,
3826 sort_buffer_size (&fp
, 1, files
, nfiles
, bytes_per_line
));
3831 while (fillbuf (&buf
, fp
, file
))
3835 if (buf
.eof
&& nfiles
3836 && (bytes_per_line
+ 1
3837 < (buf
.alloc
- buf
.used
- bytes_per_line
* buf
.nlines
)))
3839 /* End of file, but there is more input and buffer room.
3840 Concatenate the next input file; this is faster in
3842 buf
.left
= buf
.used
;
3846 line
= buffer_linelim (&buf
);
3847 if (buf
.eof
&& !nfiles
&& !ntemps
&& !buf
.left
)
3850 tfp
= xfopen (output_file
, "w");
3851 temp_output
= output_file
;
3852 output_file_created
= true;
3857 temp_output
= create_temp (&tfp
)->name
;
3861 struct merge_node_queue queue
;
3862 queue_init (&queue
, nthreads
);
3863 struct merge_node
*merge_tree
=
3864 merge_tree_init (nthreads
, buf
.nlines
, line
);
3865 struct merge_node
*root
= merge_tree
+ 1;
3867 sortlines (line
, nthreads
, buf
.nlines
, root
,
3868 &queue
, tfp
, temp_output
);
3869 queue_destroy (&queue
);
3870 pthread_mutex_destroy (&root
->lock
);
3871 merge_tree_destroy (merge_tree
);
3874 write_unique (line
- 1, tfp
, temp_output
);
3876 xfclose (tfp
, temp_output
);
3878 if (output_file_created
)
3887 if (! output_file_created
)
3890 struct tempnode
*node
= temphead
;
3891 struct sortfile
*tempfiles
= xnmalloc (ntemps
, sizeof *tempfiles
);
3892 for (i
= 0; node
; i
++)
3894 tempfiles
[i
].name
= node
->name
;
3895 tempfiles
[i
].temp
= node
;
3898 merge (tempfiles
, ntemps
, ntemps
, output_file
);
3905 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3908 insertkey (struct keyfield
*key_arg
)
3910 struct keyfield
**p
;
3911 struct keyfield
*key
= xmemdup (key_arg
, sizeof *key
);
3913 for (p
= &keylist
; *p
; p
= &(*p
)->next
)
3919 /* Report a bad field specification SPEC, with extra info MSGID. */
3921 static void badfieldspec (char const *, char const *)
3924 badfieldspec (char const *spec
, char const *msgid
)
3926 error (SORT_FAILURE
, 0, _("%s: invalid field specification %s"),
3927 _(msgid
), quote (spec
));
3931 /* Report incompatible options. */
3933 static void incompatible_options (char const *) ATTRIBUTE_NORETURN
;
3935 incompatible_options (char const *opts
)
3937 error (SORT_FAILURE
, 0, _("options '-%s' are incompatible"), opts
);
3941 /* Check compatibility of ordering options. */
3944 check_ordering_compatibility (void)
3946 struct keyfield
*key
;
3948 for (key
= keylist
; key
; key
= key
->next
)
3949 if (1 < (key
->numeric
+ key
->general_numeric
+ key
->human_numeric
3950 + key
->month
+ (key
->version
| key
->random
| !!key
->ignore
)))
3952 /* The following is too big, but guaranteed to be "big enough". */
3953 char opts
[sizeof short_options
];
3954 /* Clear flags we're not interested in. */
3955 key
->skipsblanks
= key
->skipeblanks
= key
->reverse
= false;
3956 key_to_opts (key
, opts
);
3957 incompatible_options (opts
);
3961 /* Parse the leading integer in STRING and store the resulting value
3962 (which must fit into size_t) into *VAL. Return the address of the
3963 suffix after the integer. If the value is too large, silently
3964 substitute SIZE_MAX. If MSGID is NULL, return NULL after
3965 failure; otherwise, report MSGID and exit on failure. */
3968 parse_field_count (char const *string
, size_t *val
, char const *msgid
)
3973 switch (xstrtoumax (string
, &suffix
, 10, &n
, ""))
3976 case LONGINT_INVALID_SUFFIX_CHAR
:
3981 case LONGINT_OVERFLOW
:
3982 case LONGINT_OVERFLOW
| LONGINT_INVALID_SUFFIX_CHAR
:
3986 case LONGINT_INVALID
:
3988 error (SORT_FAILURE
, 0, _("%s: invalid count at start of %s"),
3989 _(msgid
), quote (string
));
3996 /* Handle interrupts and hangups. */
3999 sighandler (int sig
)
4002 signal (sig
, SIG_IGN
);
4006 signal (sig
, SIG_DFL
);
4010 /* Set the ordering options for KEY specified in S.
4011 Return the address of the first character in S that
4012 is not a valid ordering option.
4013 BLANKTYPE is the kind of blanks that 'b' should skip. */
4016 set_ordering (char const *s
, struct keyfield
*key
, enum blanktype blanktype
)
4023 if (blanktype
== bl_start
|| blanktype
== bl_both
)
4024 key
->skipsblanks
= true;
4025 if (blanktype
== bl_end
|| blanktype
== bl_both
)
4026 key
->skipeblanks
= true;
4029 key
->ignore
= nondictionary
;
4032 key
->translate
= fold_toupper
;
4035 key
->general_numeric
= true;
4038 key
->human_numeric
= true;
4041 /* Option order should not matter, so don't let -i override
4042 -d. -d implies -i, but -i does not imply -d. */
4044 key
->ignore
= nonprinting
;
4050 key
->numeric
= true;
4056 key
->reverse
= true;
4059 key
->version
= true;
4069 /* Initialize KEY. */
4071 static struct keyfield
*
4072 key_init (struct keyfield
*key
)
4074 memset (key
, 0, sizeof *key
);
4075 key
->eword
= SIZE_MAX
;
4080 main (int argc
, char **argv
)
4082 struct keyfield
*key
;
4083 struct keyfield key_buf
;
4084 struct keyfield gkey
;
4085 bool gkey_only
= false;
4089 bool mergeonly
= false;
4090 char *random_source
= NULL
;
4091 bool need_random
= false;
4092 size_t nthreads
= 0;
4094 bool posixly_correct
= (getenv ("POSIXLY_CORRECT") != NULL
);
4095 bool obsolete_usage
= (posix2_version () < 200112);
4097 char *files_from
= NULL
;
4099 char const *outfile
= NULL
;
4101 initialize_main (&argc
, &argv
);
4102 set_program_name (argv
[0]);
4103 setlocale (LC_ALL
, "");
4104 bindtextdomain (PACKAGE
, LOCALEDIR
);
4105 textdomain (PACKAGE
);
4107 initialize_exit_failure (SORT_FAILURE
);
4109 hard_LC_COLLATE
= hard_locale (LC_COLLATE
);
4110 #if HAVE_NL_LANGINFO
4111 hard_LC_TIME
= hard_locale (LC_TIME
);
4114 /* Get locale's representation of the decimal point. */
4116 struct lconv
const *locale
= localeconv ();
4118 /* If the locale doesn't define a decimal point, or if the decimal
4119 point is multibyte, use the C locale's decimal point. FIXME:
4120 add support for multibyte decimal points. */
4121 decimal_point
= to_uchar (locale
->decimal_point
[0]);
4122 if (! decimal_point
|| locale
->decimal_point
[1])
4123 decimal_point
= '.';
4125 /* FIXME: add support for multibyte thousands separators. */
4126 thousands_sep
= to_uchar (*locale
->thousands_sep
);
4127 if (! thousands_sep
|| locale
->thousands_sep
[1])
4131 have_read_stdin
= false;
4136 static int const sig
[] =
4138 /* The usual suspects. */
4139 SIGALRM
, SIGHUP
, SIGINT
, SIGPIPE
, SIGQUIT
, SIGTERM
,
4156 enum { nsigs
= ARRAY_CARDINALITY (sig
) };
4159 struct sigaction act
;
4161 sigemptyset (&caught_signals
);
4162 for (i
= 0; i
< nsigs
; i
++)
4164 sigaction (sig
[i
], NULL
, &act
);
4165 if (act
.sa_handler
!= SIG_IGN
)
4166 sigaddset (&caught_signals
, sig
[i
]);
4169 act
.sa_handler
= sighandler
;
4170 act
.sa_mask
= caught_signals
;
4173 for (i
= 0; i
< nsigs
; i
++)
4174 if (sigismember (&caught_signals
, sig
[i
]))
4175 sigaction (sig
[i
], &act
, NULL
);
4177 for (i
= 0; i
< nsigs
; i
++)
4178 if (signal (sig
[i
], SIG_IGN
) != SIG_IGN
)
4180 signal (sig
[i
], sighandler
);
4181 siginterrupt (sig
[i
], 1);
4185 signal (SIGCHLD
, SIG_DFL
); /* Don't inherit CHLD handling from parent. */
4187 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4188 atexit (exit_cleanup
);
4191 gkey
.sword
= SIZE_MAX
;
4193 files
= xnmalloc (argc
, sizeof *files
);
4197 /* Parse an operand as a file after "--" was seen; or if
4198 pedantic and a file was seen, unless the POSIX version
4199 predates 1003.1-2001 and -c was not seen and the operand is
4200 "-o FILE" or "-oFILE". */
4204 || (posixly_correct
&& nfiles
!= 0
4205 && ! (obsolete_usage
4208 && argv
[optind
][0] == '-' && argv
[optind
][1] == 'o'
4209 && (argv
[optind
][2] || optind
+ 1 != argc
)))
4210 || ((c
= getopt_long (argc
, argv
, short_options
,
4216 files
[nfiles
++] = argv
[optind
++];
4222 if (optarg
[0] == '+')
4224 bool minus_pos_usage
= (optind
!= argc
&& argv
[optind
][0] == '-'
4225 && ISDIGIT (argv
[optind
][1]));
4226 obsolete_usage
|= minus_pos_usage
&& !posixly_correct
;
4229 /* Treat +POS1 [-POS2] as a key if possible; but silently
4230 treat an operand as a file if it is not a valid +POS1. */
4231 key
= key_init (&key_buf
);
4232 s
= parse_field_count (optarg
+ 1, &key
->sword
, NULL
);
4234 s
= parse_field_count (s
+ 1, &key
->schar
, NULL
);
4235 if (! (key
->sword
|| key
->schar
))
4236 key
->sword
= SIZE_MAX
;
4237 if (! s
|| *set_ordering (s
, key
, bl_start
))
4241 if (minus_pos_usage
)
4243 char const *optarg1
= argv
[optind
++];
4244 s
= parse_field_count (optarg1
+ 1, &key
->eword
,
4245 N_("invalid number after '-'"));
4246 /* When called with a non-NULL message ID,
4247 parse_field_count cannot return NULL. Tell static
4248 analysis tools that dereferencing S is safe. */
4251 s
= parse_field_count (s
+ 1, &key
->echar
,
4252 N_("invalid number after '.'"));
4253 if (!key
->echar
&& key
->eword
)
4255 /* obsolescent syntax +A.x -B.y is equivalent to:
4256 -k A+1.x+1,B.y (when y = 0)
4257 -k A+1.x+1,B+1.y (when y > 0)
4258 So eword is decremented as in the -k case
4259 only when the end field (B) is specified and
4263 if (*set_ordering (s
, key
, bl_end
))
4264 badfieldspec (optarg1
,
4265 N_("stray character in field spec"));
4267 key
->obsolete_used
= true;
4273 files
[nfiles
++] = optarg
;
4277 c
= XARGMATCH ("--sort", optarg
, sort_args
, sort_types
);
4294 set_ordering (str
, &gkey
, bl_both
);
4300 ? XARGMATCH ("--check", optarg
, check_args
, check_types
)
4305 if (checkonly
&& checkonly
!= c
)
4306 incompatible_options ("cC");
4310 case COMPRESS_PROGRAM_OPTION
:
4311 if (compress_program
&& !STREQ (compress_program
, optarg
))
4312 error (SORT_FAILURE
, 0, _("multiple compress programs specified"));
4313 compress_program
= optarg
;
4316 case DEBUG_PROGRAM_OPTION
:
4320 case FILES0_FROM_OPTION
:
4321 files_from
= optarg
;
4325 key
= key_init (&key_buf
);
4328 s
= parse_field_count (optarg
, &key
->sword
,
4329 N_("invalid number at field start"));
4332 /* Provoke with 'sort -k0' */
4333 badfieldspec (optarg
, N_("field number is zero"));
4337 s
= parse_field_count (s
+ 1, &key
->schar
,
4338 N_("invalid number after '.'"));
4341 /* Provoke with 'sort -k1.0' */
4342 badfieldspec (optarg
, N_("character offset is zero"));
4345 if (! (key
->sword
|| key
->schar
))
4346 key
->sword
= SIZE_MAX
;
4347 s
= set_ordering (s
, key
, bl_start
);
4350 key
->eword
= SIZE_MAX
;
4356 s
= parse_field_count (s
+ 1, &key
->eword
,
4357 N_("invalid number after ','"));
4360 /* Provoke with 'sort -k1,0' */
4361 badfieldspec (optarg
, N_("field number is zero"));
4365 s
= parse_field_count (s
+ 1, &key
->echar
,
4366 N_("invalid number after '.'"));
4368 s
= set_ordering (s
, key
, bl_end
);
4371 badfieldspec (optarg
, N_("stray character in field spec"));
4380 specify_nmerge (oi
, c
, optarg
);
4384 if (outfile
&& !STREQ (outfile
, optarg
))
4385 error (SORT_FAILURE
, 0, _("multiple output files specified"));
4389 case RANDOM_SOURCE_OPTION
:
4390 if (random_source
&& !STREQ (random_source
, optarg
))
4391 error (SORT_FAILURE
, 0, _("multiple random sources specified"));
4392 random_source
= optarg
;
4400 specify_sort_size (oi
, c
, optarg
);
4405 char newtab
= optarg
[0];
4407 error (SORT_FAILURE
, 0, _("empty tab"));
4410 if (STREQ (optarg
, "\\0"))
4414 /* Provoke with 'sort -txx'. Complain about
4415 "multi-character tab" instead of "multibyte tab", so
4416 that the diagnostic's wording does not need to be
4417 changed once multibyte characters are supported. */
4418 error (SORT_FAILURE
, 0, _("multi-character tab %s"),
4422 if (tab
!= TAB_DEFAULT
&& tab
!= newtab
)
4423 error (SORT_FAILURE
, 0, _("incompatible tabs"));
4429 add_temp_dir (optarg
);
4432 case PARALLEL_OPTION
:
4433 nthreads
= specify_nthreads (oi
, c
, optarg
);
4441 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4442 through Solaris 7. It is also accepted by many non-Solaris
4443 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4444 -y is marked as obsolete starting with Solaris 8 (1999), but is
4445 still accepted as of Solaris 10 prerelease (2004).
4447 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4448 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4449 and which in general ignores the argument after "-y" if it
4450 consists entirely of digits (it can even be empty). */
4451 if (optarg
== argv
[optind
- 1])
4454 for (p
= optarg
; ISDIGIT (*p
); p
++)
4456 optind
-= (*p
!= '\0');
4464 case_GETOPT_HELP_CHAR
;
4466 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
4469 usage (SORT_FAILURE
);
4477 /* When using --files0-from=F, you may not specify any files
4478 on the command-line. */
4481 error (0, 0, _("extra operand %s"), quote (files
[0]));
4482 fprintf (stderr
, "%s\n",
4483 _("file operands cannot be combined with --files0-from"));
4484 usage (SORT_FAILURE
);
4487 if (STREQ (files_from
, "-"))
4491 stream
= fopen (files_from
, "r");
4493 error (SORT_FAILURE
, errno
, _("cannot open %s for reading"),
4494 quote (files_from
));
4497 readtokens0_init (&tok
);
4499 if (! readtokens0 (stream
, &tok
) || fclose (stream
) != 0)
4500 error (SORT_FAILURE
, 0, _("cannot read file names from %s"),
4501 quote (files_from
));
4509 for (i
= 0; i
< nfiles
; i
++)
4511 if (STREQ (files
[i
], "-"))
4512 error (SORT_FAILURE
, 0, _("when reading file names from stdin, "
4513 "no file name of %s allowed"),
4515 else if (files
[i
][0] == '\0')
4517 /* Using the standard 'filename:line-number:' prefix here is
4518 not totally appropriate, since NUL is the separator,
4519 not NL, but it might be better than nothing. */
4520 unsigned long int file_number
= i
+ 1;
4521 error (SORT_FAILURE
, 0,
4522 _("%s:%lu: invalid zero-length file name"),
4523 quotearg_colon (files_from
), file_number
);
4528 error (SORT_FAILURE
, 0, _("no input from %s"),
4529 quote (files_from
));
4532 /* Inheritance of global options to individual keys. */
4533 for (key
= keylist
; key
; key
= key
->next
)
4535 if (default_key_compare (key
) && !key
->reverse
)
4537 key
->ignore
= gkey
.ignore
;
4538 key
->translate
= gkey
.translate
;
4539 key
->skipsblanks
= gkey
.skipsblanks
;
4540 key
->skipeblanks
= gkey
.skipeblanks
;
4541 key
->month
= gkey
.month
;
4542 key
->numeric
= gkey
.numeric
;
4543 key
->general_numeric
= gkey
.general_numeric
;
4544 key
->human_numeric
= gkey
.human_numeric
;
4545 key
->version
= gkey
.version
;
4546 key
->random
= gkey
.random
;
4547 key
->reverse
= gkey
.reverse
;
4550 need_random
|= key
->random
;
4553 if (!keylist
&& !default_key_compare (&gkey
))
4557 need_random
|= gkey
.random
;
4560 check_ordering_compatibility ();
4564 if (checkonly
|| outfile
)
4566 static char opts
[] = "X --debug";
4567 opts
[0] = (checkonly
? checkonly
: 'o');
4568 incompatible_options (opts
);
4571 /* Always output the locale in debug mode, since this
4572 is such a common source of confusion. */
4573 if (hard_LC_COLLATE
)
4574 error (0, 0, _("using %s sorting rules"),
4575 quote (setlocale (LC_COLLATE
, NULL
)));
4577 error (0, 0, _("using simple byte comparison"));
4578 key_warnings (&gkey
, gkey_only
);
4581 reverse
= gkey
.reverse
;
4584 random_md5_state_init (random_source
);
4586 if (temp_dir_count
== 0)
4588 char const *tmp_dir
= getenv ("TMPDIR");
4589 add_temp_dir (tmp_dir
? tmp_dir
: DEFAULT_TMPDIR
);
4594 static char *minus
= (char *) "-";
4600 /* Need to re-check that we meet the minimum requirement for memory
4601 usage with the final value for NMERGE. */
4603 sort_size
= MAX (sort_size
, MIN_SORT_SIZE
);
4608 error (SORT_FAILURE
, 0, _("extra operand %s not allowed with -%c"),
4609 quote (files
[1]), checkonly
);
4613 static char opts
[] = {0, 'o', 0};
4614 opts
[0] = checkonly
;
4615 incompatible_options (opts
);
4618 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4619 input is not properly sorted. */
4620 exit (check (files
[0], checkonly
) ? EXIT_SUCCESS
: SORT_OUT_OF_ORDER
);
4625 struct sortfile
*sortfiles
= xcalloc (nfiles
, sizeof *sortfiles
);
4628 for (i
= 0; i
< nfiles
; ++i
)
4629 sortfiles
[i
].name
= files
[i
];
4631 merge (sortfiles
, 0, nfiles
, outfile
);
4632 IF_LINT (free (sortfiles
));
4638 unsigned long int np
= num_processors (NPROC_CURRENT_OVERRIDABLE
);
4639 nthreads
= MIN (np
, DEFAULT_MAX_THREADS
);
4642 /* Avoid integer overflow later. */
4643 size_t nthreads_max
= SIZE_MAX
/ (2 * sizeof (struct merge_node
));
4644 nthreads
= MIN (nthreads
, nthreads_max
);
4646 sort (files
, nfiles
, outfile
, nthreads
);
4649 if (have_read_stdin
&& fclose (stdin
) == EOF
)
4650 die (_("close failed"), "-");
4652 exit (EXIT_SUCCESS
);