1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
28 #include <sys/resource.h>
29 #include <sys/types.h>
36 #include "filevercmp.h"
37 #include "flexmember.h"
38 #include "hard-locale.h"
41 #include "ignore-value.h"
48 #include "readtokens0.h"
50 #include "strnumcmp.h"
52 #include "xnanosleep.h"
54 #include "xstrtol-error.h"
57 struct rlimit
{ size_t rlim_cur
; };
58 # define getrlimit(Resource, Rlp) (-1)
61 /* The official name of this program (e.g., no 'g' prefix). */
62 #define PROGRAM_NAME "sort"
65 proper_name ("Mike Haertel"), \
66 proper_name ("Paul Eggert")
68 #if HAVE_LANGINFO_CODESET
69 # include <langinfo.h>
72 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
75 # define SA_NOCLDSTOP 0
76 /* No sigprocmask. Always 'return' zero. */
77 # define sigprocmask(How, Set, Oset) (0)
79 # if ! HAVE_SIGINTERRUPT
80 # define siginterrupt(sig, flag) /* empty */
84 #if !defined OPEN_MAX && defined NR_OPEN
85 # define OPEN_MAX NR_OPEN
91 #define UCHAR_LIM (UCHAR_MAX + 1)
93 #ifndef DEFAULT_TMPDIR
94 # define DEFAULT_TMPDIR "/tmp"
97 /* Maximum number of lines to merge every time a NODE is taken from
98 the merge queue. Node is at LEVEL in the binary merge tree,
99 and is responsible for merging TOTAL lines. */
100 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
102 /* Heuristic value for the number of lines for which it is worth creating
103 a subthread, during an internal merge sort. I.e., it is a small number
104 of "average" lines for which sorting via two threads is faster than
105 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
106 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
107 lines of gensort -a output is sorted slightly faster with --parallel=2
108 than with --parallel=1. By contrast, using --parallel=1 is about 10%
109 faster than using --parallel=2 with a 64K-line input. */
110 enum { SUBTHREAD_LINES_HEURISTIC
= 128 * 1024 };
111 static_assert (4 <= SUBTHREAD_LINES_HEURISTIC
);
113 /* The number of threads after which there are
114 diminishing performance gains. */
115 enum { DEFAULT_MAX_THREADS
= 8 };
120 /* POSIX says to exit with status 1 if invoked with -c and the
121 input is not properly sorted. */
122 SORT_OUT_OF_ORDER
= 1,
124 /* POSIX says any other irregular exit must exit with a status
125 code greater than 1. */
131 /* The number of times we should try to fork a compression process
132 (we retry if the fork call fails). We don't _need_ to compress
133 temp files, this is just to reduce file system access, so this number
134 can be small. Each retry doubles in duration. */
135 MAX_FORK_TRIES_COMPRESS
= 4,
137 /* The number of times we should try to fork a decompression process.
138 If we can't fork a decompression process, we can't sort, so this
139 number should be big. Each retry doubles in duration. */
140 MAX_FORK_TRIES_DECOMPRESS
= 9
145 /* Level of the end-of-merge node, one level above the root. */
148 /* Level of the root node in merge tree. */
152 /* The representation of the decimal point in the current locale. */
153 static char decimal_point
;
155 /* Thousands separator; if outside char range, there is no separator. */
156 static int thousands_sep
;
157 /* We currently ignore multi-byte grouping chars. */
158 static bool thousands_sep_ignored
;
160 /* Nonzero if the corresponding locales are hard. */
161 static bool hard_LC_COLLATE
;
163 static bool hard_LC_TIME
;
166 #define NONZERO(x) ((x) != 0)
168 /* The kind of blanks for '-b' to skip in various options. */
169 enum blanktype
{ bl_start
, bl_end
, bl_both
};
171 /* The character marking end of line. Default to \n. */
172 static char eolchar
= '\n';
174 /* Lines are held in memory as counted strings. */
177 char *text
; /* Text of the line. */
178 size_t length
; /* Length including final newline. */
179 char *keybeg
; /* Start of first key. */
180 char *keylim
; /* Limit of first key. */
186 char *buf
; /* Dynamically allocated buffer,
187 partitioned into 3 regions:
190 - an array of lines, in reverse order. */
191 size_t used
; /* Number of bytes used for input data. */
192 size_t nlines
; /* Number of lines in the line array. */
193 size_t alloc
; /* Number of bytes allocated. */
194 size_t left
; /* Number of bytes left from previous reads. */
195 size_t line_bytes
; /* Number of bytes to reserve for each line. */
196 bool eof
; /* An EOF has been read. */
202 size_t sword
; /* Zero-origin 'word' to start at. */
203 size_t schar
; /* Additional characters to skip. */
204 size_t eword
; /* Zero-origin last 'word' of key. */
205 size_t echar
; /* Additional characters in field. */
206 bool const *ignore
; /* Boolean array of characters to ignore. */
207 char const *translate
; /* Translation applied to characters. */
208 bool skipsblanks
; /* Skip leading blanks when finding start. */
209 bool skipeblanks
; /* Skip leading blanks when finding end. */
210 bool numeric
; /* Flag for numeric comparison. Handle
211 strings of digits with optional decimal
212 point, but no exponential notation. */
213 bool random
; /* Sort by random hash of key. */
214 bool general_numeric
; /* Flag for general, numeric comparison.
215 Handle numbers in exponential notation. */
216 bool human_numeric
; /* Flag for sorting by human readable
217 units with either SI or IEC prefixes. */
218 bool month
; /* Flag for comparison by month name. */
219 bool reverse
; /* Reverse the sense of comparison. */
220 bool version
; /* sort by version number */
221 bool traditional_used
; /* Traditional key option format is used. */
222 struct keyfield
*next
; /* Next keyfield to try. */
231 /* Binary merge tree node. */
234 struct line
*lo
; /* Lines to merge from LO child node. */
235 struct line
*hi
; /* Lines to merge from HI child node. */
236 struct line
*end_lo
; /* End of available lines from LO. */
237 struct line
*end_hi
; /* End of available lines from HI. */
238 struct line
**dest
; /* Pointer to destination of merge. */
239 size_t nlo
; /* Total Lines remaining from LO. */
240 size_t nhi
; /* Total lines remaining from HI. */
241 struct merge_node
*parent
; /* Parent node. */
242 struct merge_node
*lo_child
; /* LO child node. */
243 struct merge_node
*hi_child
; /* HI child node. */
244 unsigned int level
; /* Level in merge tree. */
245 bool queued
; /* Node is already in heap. */
246 pthread_mutex_t lock
; /* Lock for node operations. */
249 /* Priority queue of merge nodes. */
250 struct merge_node_queue
252 struct heap
*priority_queue
; /* Priority queue of merge tree nodes. */
253 pthread_mutex_t mutex
; /* Lock for queue operations. */
254 pthread_cond_t cond
; /* Conditional wait for empty queue to populate
258 /* Used to implement --unique (-u). */
259 static struct line saved_line
;
261 /* FIXME: None of these tables work with multibyte character sets.
262 Also, there are many other bugs when handling multibyte characters.
263 One way to fix this is to rewrite 'sort' to use wide characters
264 internally, but doing this with good performance is a bit
267 /* Table of blanks. */
268 static bool blanks
[UCHAR_LIM
];
270 /* Table of non-printing characters. */
271 static bool nonprinting
[UCHAR_LIM
];
273 /* Table of non-dictionary characters (not letters, digits, or blanks). */
274 static bool nondictionary
[UCHAR_LIM
];
276 /* Translation table folding lower case to upper. */
277 static char fold_toupper
[UCHAR_LIM
];
279 #define MONTHS_PER_YEAR 12
281 /* Table mapping month names to integers.
282 Alphabetic order allows binary search. */
283 static struct month monthtab
[] =
299 /* During the merge phase, the number of files to merge at once. */
300 #define NMERGE_DEFAULT 16
302 /* Minimum size for a merge or check buffer. */
303 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
305 /* Minimum sort size; the code might not work with smaller sizes. */
306 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
308 /* The number of bytes needed for a merge or check buffer, which can
309 function relatively efficiently even if it holds only one line. If
310 a longer line is seen, this value is increased. */
311 static size_t merge_buffer_size
= MAX (MIN_MERGE_BUFFER_SIZE
, 256 * 1024);
313 /* The approximate maximum number of bytes of main memory to use, as
314 specified by the user. Zero if the user has not specified a size. */
315 static size_t sort_size
;
317 /* The initial allocation factor for non-regular files.
318 This is used, e.g., when reading from a pipe.
319 Don't make it too big, since it is multiplied by ~130 to
320 obtain the size of the actual buffer sort will allocate.
321 Also, there may be 8 threads all doing this at the same time. */
322 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
324 /* Array of directory names in which any temporary files are to be created. */
325 static char const **temp_dirs
;
327 /* Number of temporary directory names used. */
328 static size_t temp_dir_count
;
330 /* Number of allocated slots in temp_dirs. */
331 static size_t temp_dir_alloc
;
333 /* Flag to reverse the order of all comparisons. */
336 /* Flag for stable sort. This turns off the last ditch bytewise
337 comparison of lines, and instead leaves lines in the same order
338 they were read if all keys compare equal. */
341 /* An int value outside char range. */
342 enum { NON_CHAR
= CHAR_MAX
+ 1 };
344 /* If TAB has this value, blanks separate fields. */
345 enum { TAB_DEFAULT
= CHAR_MAX
+ 1 };
347 /* Tab character separating fields. If TAB_DEFAULT, then fields are
348 separated by the empty string between a non-blank character and a blank
350 static int tab
= TAB_DEFAULT
;
352 /* Flag to remove consecutive duplicate lines from the output.
353 Only the last of a sequence of equal lines will be output. */
356 /* Nonzero if any of the input files are the standard input. */
357 static bool have_read_stdin
;
359 /* List of key field comparisons to be tried. */
360 static struct keyfield
*keylist
;
362 /* Program used to (de)compress temp files. Must accept -d. */
363 static char const *compress_program
;
365 /* Annotate the output with extra info to aid the user. */
368 /* Maximum number of files to merge in one go. If more than this
369 number are present, temp files will be used. */
370 static unsigned int nmerge
= NMERGE_DEFAULT
;
372 /* Output an error to stderr and exit using async-signal-safe routines.
373 This can be used safely from signal handlers,
374 and between fork and exec of multithreaded processes. */
376 static _Noreturn
void
377 async_safe_die (int errnum
, char const *errstr
)
379 ignore_value (write (STDERR_FILENO
, errstr
, strlen (errstr
)));
381 /* Even if defined HAVE_STRERROR_R, we can't use it,
382 as it may return a translated string etc. and even if not
383 may call malloc which is unsafe. We might improve this
384 by testing for sys_errlist and using that if available.
385 For now just report the error number. */
388 char errbuf
[INT_BUFSIZE_BOUND (errnum
)];
389 char *p
= inttostr (errnum
, errbuf
);
390 ignore_value (write (STDERR_FILENO
, ": errno ", 8));
391 ignore_value (write (STDERR_FILENO
, p
, strlen (p
)));
394 ignore_value (write (STDERR_FILENO
, "\n", 1));
396 _exit (SORT_FAILURE
);
399 /* Report MESSAGE for FILE, then clean up and exit.
400 If FILE is null, it represents standard output. */
403 sort_die (char const *message
, char const *file
)
405 error (SORT_FAILURE
, errno
, "%s: %s", message
,
406 quotef (file
? file
: _("standard output")));
412 if (status
!= EXIT_SUCCESS
)
417 Usage: %s [OPTION]... [FILE]...\n\
418 or: %s [OPTION]... --files0-from=F\n\
420 program_name
, program_name
);
422 Write sorted concatenation of all FILE(s) to standard output.\n\
426 emit_mandatory_arg_note ();
433 -b, --ignore-leading-blanks ignore leading blanks\n\
434 -d, --dictionary-order consider only blanks and alphanumeric characters\
436 -f, --ignore-case fold lower case to upper case characters\n\
439 -g, --general-numeric-sort compare according to general numerical value\n\
440 -i, --ignore-nonprinting consider only printable characters\n\
441 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
444 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
447 -n, --numeric-sort compare according to string numerical value;\n\
448 see manual for which strings are supported\n\
449 -R, --random-sort shuffle, but group identical keys. See shuf(1)\n\
450 --random-source=FILE get random bytes from FILE\n\
451 -r, --reverse reverse the result of comparisons\n\
454 --sort=WORD sort according to WORD:\n\
455 general-numeric -g, human-numeric -h, month -M,\
457 numeric -n, random -R, version -V\n\
458 -V, --version-sort natural sort of (version) numbers within text\n\
466 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
467 for more use temp files\n\
470 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
471 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
473 --compress-program=PROG compress temporaries with PROG;\n\
474 decompress them with PROG -d\n\
477 --debug annotate the part of the line used to sort,\n\
478 and warn about questionable usage to stderr\n\
479 --files0-from=F read input from the files specified by\n\
480 NUL-terminated names in file F;\n\
481 If F is - then read names from standard input\n\
484 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
485 -m, --merge merge already sorted files; do not sort\n\
488 -o, --output=FILE write result to FILE instead of standard output\n\
489 -s, --stable stabilize sort by disabling last-resort comparison\
491 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
494 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
495 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
496 multiple options specify multiple directories\n\
497 --parallel=N change the number of sorts run concurrently to N\n\
498 -u, --unique with -c, check for strict ordering;\n\
499 without -c, output only the first of an equal run\
503 -z, --zero-terminated line delimiter is NUL, not newline\n\
505 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
506 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
509 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
510 field number and C a character position in the field; both are origin 1, and\n\
511 the stop position defaults to the line's end. If neither -t nor -b is in\n\
512 effect, characters in a field are counted from the beginning of the preceding\n\
513 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
515 which override global ordering options for that key. If no key is given, use\n\
516 the entire line as the key. Use --debug to diagnose incorrect key usage.\n\
518 SIZE may be followed by the following multiplicative suffixes:\n\
521 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y, R, Q.\
524 The locale specified by the environment affects sort order.\n\
525 Set LC_ALL=C to get the traditional sort order that uses\n\
526 native byte values.\n\
528 emit_ancillary_info (PROGRAM_NAME
);
534 /* For long options that have no equivalent short option, use a
535 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
538 CHECK_OPTION
= CHAR_MAX
+ 1,
539 COMPRESS_PROGRAM_OPTION
,
540 DEBUG_PROGRAM_OPTION
,
543 RANDOM_SOURCE_OPTION
,
548 static char const short_options
[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
550 static struct option
const long_options
[] =
552 {"ignore-leading-blanks", no_argument
, nullptr, 'b'},
553 {"check", optional_argument
, nullptr, CHECK_OPTION
},
554 {"compress-program", required_argument
, nullptr, COMPRESS_PROGRAM_OPTION
},
555 {"debug", no_argument
, nullptr, DEBUG_PROGRAM_OPTION
},
556 {"dictionary-order", no_argument
, nullptr, 'd'},
557 {"ignore-case", no_argument
, nullptr, 'f'},
558 {"files0-from", required_argument
, nullptr, FILES0_FROM_OPTION
},
559 {"general-numeric-sort", no_argument
, nullptr, 'g'},
560 {"ignore-nonprinting", no_argument
, nullptr, 'i'},
561 {"key", required_argument
, nullptr, 'k'},
562 {"merge", no_argument
, nullptr, 'm'},
563 {"month-sort", no_argument
, nullptr, 'M'},
564 {"numeric-sort", no_argument
, nullptr, 'n'},
565 {"human-numeric-sort", no_argument
, nullptr, 'h'},
566 {"version-sort", no_argument
, nullptr, 'V'},
567 {"random-sort", no_argument
, nullptr, 'R'},
568 {"random-source", required_argument
, nullptr, RANDOM_SOURCE_OPTION
},
569 {"sort", required_argument
, nullptr, SORT_OPTION
},
570 {"output", required_argument
, nullptr, 'o'},
571 {"reverse", no_argument
, nullptr, 'r'},
572 {"stable", no_argument
, nullptr, 's'},
573 {"batch-size", required_argument
, nullptr, NMERGE_OPTION
},
574 {"buffer-size", required_argument
, nullptr, 'S'},
575 {"field-separator", required_argument
, nullptr, 't'},
576 {"temporary-directory", required_argument
, nullptr, 'T'},
577 {"unique", no_argument
, nullptr, 'u'},
578 {"zero-terminated", no_argument
, nullptr, 'z'},
579 {"parallel", required_argument
, nullptr, PARALLEL_OPTION
},
580 {GETOPT_HELP_OPTION_DECL
},
581 {GETOPT_VERSION_OPTION_DECL
},
582 {nullptr, 0, nullptr, 0},
585 #define CHECK_TABLE \
587 _ct_("silent", 'C') \
588 _ct_("diagnose-first", 'c')
590 static char const *const check_args
[] =
592 #define _ct_(_s, _c) _s,
596 static char const check_types
[] =
598 #define _ct_(_s, _c) _c,
604 _st_("general-numeric", 'g') \
605 _st_("human-numeric", 'h') \
607 _st_("numeric", 'n') \
608 _st_("random", 'R') \
611 static char const *const sort_args
[] =
613 #define _st_(_s, _c) _s,
617 static char const sort_types
[] =
619 #define _st_(_s, _c) _c,
624 /* The set of signals that are caught. */
625 static sigset_t caught_signals
;
627 /* Critical section status. */
634 /* Enter a critical section. */
636 cs_enter (struct cs_status
*status
)
638 int ret
= pthread_sigmask (SIG_BLOCK
, &caught_signals
, &status
->sigs
);
639 status
->valid
= ret
== 0;
642 /* Leave a critical section. */
644 cs_leave (struct cs_status
const *status
)
648 /* Ignore failure when restoring the signal mask. */
649 pthread_sigmask (SIG_SETMASK
, &status
->sigs
, nullptr);
653 /* Possible states for a temp file. If compressed, the file's status
654 is unreaped or reaped, depending on whether 'sort' has waited for
655 the subprocess to finish. */
656 enum { UNCOMPRESSED
, UNREAPED
, REAPED
};
658 /* The list of temporary files. */
661 struct tempnode
*volatile next
;
662 pid_t pid
; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
664 char name
[FLEXIBLE_ARRAY_MEMBER
];
666 static struct tempnode
*volatile temphead
;
667 static struct tempnode
*volatile *temptail
= &temphead
;
669 /* A file to be sorted. */
672 /* The file's name. */
675 /* Non-null if this is a temporary file, in which case NAME == TEMP->name. */
676 struct tempnode
*temp
;
679 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
680 static Hash_table
*proctab
;
682 enum { INIT_PROCTAB_SIZE
= 47 };
685 proctab_hasher (void const *entry
, size_t tabsize
)
687 struct tempnode
const *node
= entry
;
688 return node
->pid
% tabsize
;
692 proctab_comparator (void const *e1
, void const *e2
)
694 struct tempnode
const *n1
= e1
;
695 struct tempnode
const *n2
= e2
;
696 return n1
->pid
== n2
->pid
;
699 /* The number of unreaped child processes. */
702 static bool delete_proc (pid_t
);
704 /* If PID is positive, wait for the child process with that PID to
705 exit, and assume that PID has already been removed from the process
706 table. If PID is 0 or -1, clean up some child that has exited (by
707 waiting for it, and removing it from the proc table) and return the
708 child's process ID. However, if PID is 0 and no children have
709 exited, return 0 without waiting. */
715 pid_t cpid
= waitpid ((pid
? pid
: -1), &status
, (pid
? 0 : WNOHANG
));
718 error (SORT_FAILURE
, errno
, _("waiting for %s [-d]"),
719 quoteaf (compress_program
));
720 else if (0 < cpid
&& (0 < pid
|| delete_proc (cpid
)))
722 if (! WIFEXITED (status
) || WEXITSTATUS (status
))
723 error (SORT_FAILURE
, 0, _("%s [-d] terminated abnormally"),
724 quoteaf (compress_program
));
731 /* TEMP represents a new process; add it to the process table. Create
732 the process table the first time it's called. */
735 register_proc (struct tempnode
*temp
)
739 proctab
= hash_initialize (INIT_PROCTAB_SIZE
, nullptr,
747 temp
->state
= UNREAPED
;
749 if (! hash_insert (proctab
, temp
))
753 /* If PID is in the process table, remove it and return true.
754 Otherwise, return false. */
757 delete_proc (pid_t pid
)
759 struct tempnode test
;
762 struct tempnode
*node
= hash_remove (proctab
, &test
);
765 node
->state
= REAPED
;
769 /* Remove PID from the process table, and wait for it to exit if it
773 wait_proc (pid_t pid
)
775 if (delete_proc (pid
))
779 /* Reap any exited children. Do not block; reap only those that have
785 while (0 < nprocs
&& reap (0))
789 /* Reap at least one exited child, waiting if necessary. */
798 /* Reap all children, waiting if necessary. */
807 /* Clean up any remaining temporary files. */
812 struct tempnode
const *node
;
814 for (node
= temphead
; node
; node
= node
->next
)
819 /* Cleanup actions to take when exiting. */
826 /* Clean up any remaining temporary files in a critical section so
827 that a signal handler does not try to clean them too. */
837 /* Create a new temporary file, returning its newly allocated tempnode.
838 Store into *PFD the file descriptor open for writing.
839 If the creation fails, return nullptr and store -1 into *PFD if the
840 failure is due to file descriptor exhaustion and
841 SURVIVE_FD_EXHAUSTION; otherwise, die. */
843 static struct tempnode
*
844 create_temp_file (int *pfd
, bool survive_fd_exhaustion
)
846 static char const slashbase
[] = "/sortXXXXXX";
847 static size_t temp_dir_index
;
850 char const *temp_dir
= temp_dirs
[temp_dir_index
];
851 size_t len
= strlen (temp_dir
);
852 struct tempnode
*node
=
853 xmalloc (FLEXSIZEOF (struct tempnode
, name
, len
+ sizeof slashbase
));
854 char *file
= node
->name
;
857 memcpy (file
, temp_dir
, len
);
858 memcpy (file
+ len
, slashbase
, sizeof slashbase
);
859 node
->next
= nullptr;
860 if (++temp_dir_index
== temp_dir_count
)
863 /* Create the temporary file in a critical section, to avoid races. */
865 fd
= mkostemp (file
, O_CLOEXEC
);
869 temptail
= &node
->next
;
877 if (! (survive_fd_exhaustion
&& errno
== EMFILE
))
878 error (SORT_FAILURE
, errno
, _("cannot create temporary file in %s"),
888 /* Return a pointer to stdout status, or nullptr on failure. */
893 static int outstat_errno
;
894 static struct stat outstat
;
895 if (outstat_errno
== 0)
896 outstat_errno
= fstat (STDOUT_FILENO
, &outstat
) == 0 ? -1 : errno
;
897 return outstat_errno
< 0 ? &outstat
: nullptr;
900 /* Return a stream for FILE, opened with mode HOW. If HOW is "w",
901 the file is already open on standard output, and needs to be
902 truncated unless FILE is null. When opening for input, "-"
903 means standard input. To avoid confusion, do not return file
904 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
905 opening an ordinary FILE. Return nullptr if unsuccessful.
907 Use fadvise to specify an access pattern for input files.
908 There are a few hints we could possibly provide,
909 and after careful testing it was decided that
910 specifying FADVISE_SEQUENTIAL was not detrimental
911 to any cases. On Linux 2.6.31, this option doubles
912 the size of read ahead performed and thus was seen to
915 Sorting with a smaller internal buffer
916 Reading from faster flash devices
918 In _addition_ one could also specify other hints...
920 FADVISE_WILLNEED was tested, but Linux 2.6.31
921 at least uses that to _synchronously_ prepopulate the cache
922 with the specified range. While sort does need to
923 read all of its input before outputting, a synchronous
924 read of the whole file up front precludes any processing
925 that sort could do in parallel with the system doing
926 read ahead of the data. This was seen to have negative effects
927 in a couple of cases:
929 Sorting with a smaller internal buffer
930 This option was seen to shorten the runtime for sort
931 on a multicore system with lots of RAM and other processes
932 competing for CPU. It could be argued that more explicit
933 scheduling hints with 'nice' et. al. are more appropriate
936 FADVISE_NOREUSE is a possibility as it could lower
937 the priority of input data in the cache as sort will
938 only need to process it once. However its functionality
939 has changed over Linux kernel versions and as of 2.6.31
940 it does nothing and thus we can't depend on what it might
943 FADVISE_DONTNEED is not appropriate for user specified
944 input files, but for temp files we do want to drop the
945 cache immediately after processing. This is done implicitly
946 however when the files are unlinked. */
949 stream_open (char const *file
, char const *how
)
955 if (STREQ (file
, "-"))
957 have_read_stdin
= true;
962 int fd
= open (file
, O_RDONLY
| O_CLOEXEC
);
963 fp
= fd
< 0 ? nullptr : fdopen (fd
, how
);
965 fadvise (fp
, FADVISE_SEQUENTIAL
);
967 else if (*how
== 'w')
969 if (file
&& ftruncate (STDOUT_FILENO
, 0) != 0)
971 int ftruncate_errno
= errno
;
972 struct stat
*outst
= get_outstatus ();
973 if (!outst
|| S_ISREG (outst
->st_mode
) || S_TYPEISSHM (outst
))
974 error (SORT_FAILURE
, ftruncate_errno
, _("%s: error truncating"),
980 affirm (!"unexpected mode passed to stream_open");
985 /* Same as stream_open, except always return a non-null value; die on
989 xfopen (char const *file
, char const *how
)
991 FILE *fp
= stream_open (file
, how
);
993 sort_die (_("open failed"), file
);
997 /* Close FP, whose name is FILE, and report any errors. */
1000 xfclose (FILE *fp
, char const *file
)
1002 switch (fileno (fp
))
1005 /* Allow reading stdin from tty more than once. */
1010 /* Don't close stdout just yet. close_stdout does that. */
1011 if (fflush (fp
) != 0)
1012 sort_die (_("fflush failed"), file
);
1016 if (fclose (fp
) != 0)
1017 sort_die (_("close failed"), file
);
1022 /* Move OLDFD to NEWFD. If OLDFD != NEWFD, NEWFD is not close-on-exec. */
1025 move_fd (int oldfd
, int newfd
)
1029 /* These should never fail for our usage. */
1030 ignore_value (dup2 (oldfd
, newfd
));
1031 ignore_value (close (oldfd
));
1035 /* Fork a child process for piping to and do common cleanup. The
1036 TRIES parameter specifies how many times to try to fork before
1037 giving up. Return the PID of the child, or -1 (setting errno)
1041 pipe_fork (int pipefds
[2], size_t tries
)
1043 #if HAVE_WORKING_FORK
1044 struct tempnode
*saved_temphead
;
1046 double wait_retry
= 0.25;
1048 struct cs_status cs
;
1050 if (pipe2 (pipefds
, O_CLOEXEC
) < 0)
1053 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1054 uncontrolled subprocess generation can hurt performance significantly.
1055 Allow at most NMERGE + 2 subprocesses, on the theory that there
1056 may be some useful parallelism by letting compression for the
1057 previous merge finish (1 subprocess) in parallel with the current
1058 merge (NMERGE + 1 subprocesses). */
1060 if (nmerge
+ 1 < nprocs
)
1065 /* This is so the child process won't delete our temp files
1066 if it receives a signal before exec-ing. */
1068 saved_temphead
= temphead
;
1072 saved_errno
= errno
;
1074 temphead
= saved_temphead
;
1077 errno
= saved_errno
;
1079 if (0 <= pid
|| errno
!= EAGAIN
)
1083 xnanosleep (wait_retry
);
1091 saved_errno
= errno
;
1094 errno
= saved_errno
;
1098 close (STDIN_FILENO
);
1099 close (STDOUT_FILENO
);
1106 #else /* ! HAVE_WORKING_FORK */
1111 /* Create a temporary file and, if asked for, start a compressor
1112 to that file. Set *PFP to the file handle and return
1113 the address of the new temp node. If the creation
1114 fails, return nullptr if the failure is due to file descriptor
1115 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1117 static struct tempnode
*
1118 maybe_create_temp (FILE **pfp
, bool survive_fd_exhaustion
)
1121 struct tempnode
*node
= create_temp_file (&tempfd
, survive_fd_exhaustion
);
1125 node
->state
= UNCOMPRESSED
;
1127 if (compress_program
)
1131 node
->pid
= pipe_fork (pipefds
, MAX_FORK_TRIES_COMPRESS
);
1136 tempfd
= pipefds
[1];
1138 register_proc (node
);
1140 else if (node
->pid
== 0)
1142 /* Being the child of a multithreaded program before exec,
1143 we're restricted to calling async-signal-safe routines here. */
1145 move_fd (tempfd
, STDOUT_FILENO
);
1146 move_fd (pipefds
[0], STDIN_FILENO
);
1148 execlp (compress_program
, compress_program
, (char *) nullptr);
1150 async_safe_die (errno
, "couldn't execute compress program");
1154 *pfp
= fdopen (tempfd
, "w");
1156 sort_die (_("couldn't create temporary file"), node
->name
);
1161 /* Create a temporary file and, if asked for, start a compressor
1162 to that file. Set *PFP to the file handle and return the address
1163 of the new temp node. Die on failure. */
1165 static struct tempnode
*
1166 create_temp (FILE **pfp
)
1168 return maybe_create_temp (pfp
, false);
1171 /* Open a compressed temp file and start a decompression process through
1172 which to filter the input. Return nullptr (setting errno to
1173 EMFILE) if we ran out of file descriptors, and die on any other
1177 open_temp (struct tempnode
*temp
)
1179 int tempfd
, pipefds
[2];
1182 if (temp
->state
== UNREAPED
)
1183 wait_proc (temp
->pid
);
1185 tempfd
= open (temp
->name
, O_RDONLY
);
1189 pid_t child
= pipe_fork (pipefds
, MAX_FORK_TRIES_DECOMPRESS
);
1194 if (errno
!= EMFILE
)
1195 error (SORT_FAILURE
, errno
, _("couldn't create process for %s -d"),
1196 quoteaf (compress_program
));
1202 /* Being the child of a multithreaded program before exec,
1203 we're restricted to calling async-signal-safe routines here. */
1205 move_fd (tempfd
, STDIN_FILENO
);
1206 move_fd (pipefds
[1], STDOUT_FILENO
);
1208 execlp (compress_program
, compress_program
, "-d", (char *) nullptr);
1210 async_safe_die (errno
, "couldn't execute compress program (with -d)");
1214 register_proc (temp
);
1218 fp
= fdopen (pipefds
[0], "r");
1221 int saved_errno
= errno
;
1223 errno
= saved_errno
;
1231 /* Append DIR to the array of temporary directory names. */
1233 add_temp_dir (char const *dir
)
1235 if (temp_dir_count
== temp_dir_alloc
)
1236 temp_dirs
= X2NREALLOC (temp_dirs
, &temp_dir_alloc
);
1238 temp_dirs
[temp_dir_count
++] = dir
;
1241 /* Remove NAME from the list of temporary files. */
1244 zaptemp (char const *name
)
1246 struct tempnode
*volatile *pnode
;
1247 struct tempnode
*node
;
1248 struct tempnode
*next
;
1250 int unlink_errno
= 0;
1251 struct cs_status cs
;
1253 for (pnode
= &temphead
; (node
= *pnode
)->name
!= name
; pnode
= &node
->next
)
1256 if (node
->state
== UNREAPED
)
1257 wait_proc (node
->pid
);
1259 /* Unlink the temporary file in a critical section to avoid races. */
1262 unlink_status
= unlink (name
);
1263 unlink_errno
= errno
;
1267 if (unlink_status
!= 0)
1268 error (0, unlink_errno
, _("warning: cannot remove: %s"), quotef (name
));
1274 #if HAVE_NL_LANGINFO
1277 struct_month_cmp (void const *m1
, void const *m2
)
1279 struct month
const *month1
= m1
;
1280 struct month
const *month2
= m2
;
1281 return strcmp (month1
->name
, month2
->name
);
1286 /* Initialize the character class tables. */
1293 for (i
= 0; i
< UCHAR_LIM
; ++i
)
1295 blanks
[i
] = i
== '\n' || isblank (i
);
1296 nondictionary
[i
] = ! blanks
[i
] && ! isalnum (i
);
1297 nonprinting
[i
] = ! isprint (i
);
1298 fold_toupper
[i
] = toupper (i
);
1301 #if HAVE_NL_LANGINFO
1302 /* If we're not in the "C" locale, read different names for months. */
1305 for (i
= 0; i
< MONTHS_PER_YEAR
; i
++)
1312 s
= nl_langinfo (ABMON_1
+ i
);
1314 monthtab
[i
].name
= name
= xmalloc (s_len
+ 1);
1315 monthtab
[i
].val
= i
+ 1;
1317 for (j
= k
= 0; j
< s_len
; j
++)
1318 if (! isblank (to_uchar (s
[j
])))
1319 name
[k
++] = fold_toupper
[to_uchar (s
[j
])];
1322 qsort (monthtab
, MONTHS_PER_YEAR
, sizeof *monthtab
, struct_month_cmp
);
1327 /* Specify how many inputs may be merged at once.
1328 This may be set on the command-line with the
1329 --batch-size option. */
1331 specify_nmerge (int oi
, char c
, char const *s
)
1334 struct rlimit rlimit
;
1335 enum strtol_error e
= xstrtoumax (s
, nullptr, 10, &n
, "");
1337 /* Try to find out how many file descriptors we'll be able
1338 to open. We need at least nmerge + 3 (STDIN_FILENO,
1339 STDOUT_FILENO and STDERR_FILENO). */
1340 unsigned int max_nmerge
= ((getrlimit (RLIMIT_NOFILE
, &rlimit
) == 0
1345 if (e
== LONGINT_OK
)
1349 e
= LONGINT_OVERFLOW
;
1354 error (0, 0, _("invalid --%s argument %s"),
1355 long_options
[oi
].name
, quote (s
));
1356 error (SORT_FAILURE
, 0,
1357 _("minimum --%s argument is %s"),
1358 long_options
[oi
].name
, quote ("2"));
1360 else if (max_nmerge
< nmerge
)
1362 e
= LONGINT_OVERFLOW
;
1369 if (e
== LONGINT_OVERFLOW
)
1371 char max_nmerge_buf
[INT_BUFSIZE_BOUND (max_nmerge
)];
1372 error (0, 0, _("--%s argument %s too large"),
1373 long_options
[oi
].name
, quote (s
));
1374 error (SORT_FAILURE
, 0,
1375 _("maximum --%s argument with current rlimit is %s"),
1376 long_options
[oi
].name
,
1377 uinttostr (max_nmerge
, max_nmerge_buf
));
1380 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1383 /* Specify the amount of main memory to use when sorting. */
1385 specify_sort_size (int oi
, char c
, char const *s
)
1389 enum strtol_error e
= xstrtoumax (s
, &suffix
, 10, &n
, "EgGkKmMPQRtTYZ");
1391 /* The default unit is KiB. */
1392 if (e
== LONGINT_OK
&& ISDIGIT (suffix
[-1]))
1394 if (n
<= UINTMAX_MAX
/ 1024)
1397 e
= LONGINT_OVERFLOW
;
1400 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1401 if (e
== LONGINT_INVALID_SUFFIX_CHAR
&& ISDIGIT (suffix
[-1]) && ! suffix
[1])
1410 double mem
= physmem_total () * n
/ 100;
1412 /* Use "<", not "<=", to avoid problems with rounding. */
1413 if (mem
< UINTMAX_MAX
)
1419 e
= LONGINT_OVERFLOW
;
1424 if (e
== LONGINT_OK
)
1426 /* If multiple sort sizes are specified, take the maximum, so
1427 that option order does not matter. */
1434 sort_size
= MAX (sort_size
, MIN_SORT_SIZE
);
1438 e
= LONGINT_OVERFLOW
;
1441 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1444 /* Specify the number of threads to spawn during internal sort. */
1446 specify_nthreads (int oi
, char c
, char const *s
)
1449 enum strtol_error e
= xstrtoumax (s
, nullptr, 10, &nthreads
, "");
1450 if (e
== LONGINT_OVERFLOW
)
1452 if (e
!= LONGINT_OK
)
1453 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1454 if (SIZE_MAX
< nthreads
)
1455 nthreads
= SIZE_MAX
;
1457 error (SORT_FAILURE
, 0, _("number in parallel must be nonzero"));
1461 /* Return the default sort size. */
1463 default_sort_size (void)
1465 /* Let SIZE be MEM, but no more than the maximum object size,
1466 total memory, or system resource limits. Don't bother to check
1467 for values like RLIM_INFINITY since in practice they are not much
1468 less than SIZE_MAX. */
1469 size_t size
= SIZE_MAX
;
1470 struct rlimit rlimit
;
1471 if (getrlimit (RLIMIT_DATA
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
1472 size
= rlimit
.rlim_cur
;
1474 if (getrlimit (RLIMIT_AS
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
1475 size
= rlimit
.rlim_cur
;
1478 /* Leave a large safety margin for the above limits, as failure can
1479 occur when they are exceeded. */
1483 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1484 Exceeding RSS is not fatal, but can be quite slow. */
1485 if (getrlimit (RLIMIT_RSS
, &rlimit
) == 0 && rlimit
.rlim_cur
/ 16 * 15 < size
)
1486 size
= rlimit
.rlim_cur
/ 16 * 15;
1489 /* Let MEM be available memory or 1/8 of total memory, whichever
1491 double avail
= physmem_available ();
1492 double total
= physmem_total ();
1493 double mem
= MAX (avail
, total
/ 8);
1495 /* Leave a 1/4 margin for physical memory. */
1496 if (total
* 0.75 < size
)
1497 size
= total
* 0.75;
1499 /* Return the minimum of MEM and SIZE, but no less than
1500 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1501 right when only one argument is floating point. */
1504 return MAX (size
, MIN_SORT_SIZE
);
1507 /* Return the sort buffer size to use with the input files identified
1508 by FPS and FILES, which are alternate names of the same files.
1509 NFILES gives the number of input files; NFPS may be less. Assume
1510 that each input line requires LINE_BYTES extra bytes' worth of line
1511 information. Do not exceed the size bound specified by the user
1512 (or a default size bound, if the user does not specify one). */
1515 sort_buffer_size (FILE *const *fps
, size_t nfps
,
1516 char *const *files
, size_t nfiles
,
1519 /* A bound on the input size. If zero, the bound hasn't been
1521 static size_t size_bound
;
1523 /* In the worst case, each input byte is a newline. */
1524 size_t worst_case_per_input_byte
= line_bytes
+ 1;
1526 /* Keep enough room for one extra input line and an extra byte.
1527 This extra room might be needed when preparing to read EOF. */
1528 size_t size
= worst_case_per_input_byte
+ 1;
1530 for (size_t i
= 0; i
< nfiles
; i
++)
1536 if ((i
< nfps
? fstat (fileno (fps
[i
]), &st
)
1537 : STREQ (files
[i
], "-") ? fstat (STDIN_FILENO
, &st
)
1538 : stat (files
[i
], &st
))
1540 sort_die (_("stat failed"), files
[i
]);
1542 if (S_ISREG (st
.st_mode
))
1543 file_size
= st
.st_size
;
1546 /* The file has unknown size. If the user specified a sort
1547 buffer size, use that; otherwise, guess the size. */
1550 file_size
= INPUT_FILE_SIZE_GUESS
;
1555 size_bound
= sort_size
;
1557 size_bound
= default_sort_size ();
1560 /* Add the amount of memory needed to represent the worst case
1561 where the input consists entirely of newlines followed by a
1562 single non-newline. Check for overflow. */
1563 worst_case
= file_size
* worst_case_per_input_byte
+ 1;
1564 if (file_size
!= worst_case
/ worst_case_per_input_byte
1565 || size_bound
- size
<= worst_case
)
1573 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1574 must be at least sizeof (struct line). Allocate ALLOC bytes
1578 initbuf (struct buffer
*buf
, size_t line_bytes
, size_t alloc
)
1580 /* Ensure that the line array is properly aligned. If the desired
1581 size cannot be allocated, repeatedly halve it until allocation
1582 succeeds. The smaller allocation may hurt overall performance,
1583 but that's better than failing. */
1586 alloc
+= sizeof (struct line
) - alloc
% sizeof (struct line
);
1587 buf
->buf
= malloc (alloc
);
1591 if (alloc
<= line_bytes
+ 1)
1595 buf
->line_bytes
= line_bytes
;
1597 buf
->used
= buf
->left
= buf
->nlines
= 0;
1601 /* Return one past the limit of the line array. */
1603 static inline struct line
*
1604 buffer_linelim (struct buffer
const *buf
)
1606 void *linelim
= buf
->buf
+ buf
->alloc
;
1610 /* Return a pointer to the first character of the field specified
1614 begfield (struct line
const *line
, struct keyfield
const *key
)
1616 char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
1617 size_t sword
= key
->sword
;
1618 size_t schar
= key
->schar
;
1620 /* The leading field separator itself is included in a field when -t
1623 if (tab
!= TAB_DEFAULT
)
1624 while (ptr
< lim
&& sword
--)
1626 while (ptr
< lim
&& *ptr
!= tab
)
1632 while (ptr
< lim
&& sword
--)
1634 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1636 while (ptr
< lim
&& !blanks
[to_uchar (*ptr
)])
1640 /* If we're ignoring leading blanks when computing the Start
1641 of the field, skip past them here. */
1642 if (key
->skipsblanks
)
1643 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1646 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1647 ptr
= MIN (lim
, ptr
+ schar
);
1652 /* Return the limit of (a pointer to the first character after) the field
1653 in LINE specified by KEY. */
1657 limfield (struct line
const *line
, struct keyfield
const *key
)
1659 char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
1660 size_t eword
= key
->eword
, echar
= key
->echar
;
1663 eword
++; /* Skip all of end field. */
1665 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1666 whichever comes first. If there are more than EWORD fields, leave
1667 PTR pointing at the beginning of the field having zero-based index,
1668 EWORD. If a delimiter character was specified (via -t), then that
1669 'beginning' is the first character following the delimiting TAB.
1670 Otherwise, leave PTR pointing at the first 'blank' character after
1671 the preceding field. */
1672 if (tab
!= TAB_DEFAULT
)
1673 while (ptr
< lim
&& eword
--)
1675 while (ptr
< lim
&& *ptr
!= tab
)
1677 if (ptr
< lim
&& (eword
|| echar
))
1681 while (ptr
< lim
&& eword
--)
1683 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1685 while (ptr
< lim
&& !blanks
[to_uchar (*ptr
)])
1689 #ifdef POSIX_UNSPECIFIED
1690 /* The following block of code makes GNU sort incompatible with
1691 standard Unix sort, so it's ifdef'd out for now.
1692 The POSIX spec isn't clear on how to interpret this.
1693 FIXME: request clarification.
1695 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1696 Date: Thu, 30 May 96 12:20:41 -0400
1697 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1699 [...]I believe I've found another bug in 'sort'.
1704 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1707 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1711 Unix sort produced the answer I expected: sort on the single character
1712 in column 7. GNU sort produced different results, because it disagrees
1713 on the interpretation of the key-end spec "M.N". Unix sort reads this
1714 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1715 "skip M-1 fields, then either N-1 characters or the rest of the current
1716 field, whichever comes first". This extra clause applies only to
1717 key-ends, not key-starts.
1720 /* Make LIM point to the end of (one byte past) the current field. */
1721 if (tab
!= TAB_DEFAULT
)
1724 newlim
= memchr (ptr
, tab
, lim
- ptr
);
1732 while (newlim
< lim
&& blanks
[to_uchar (*newlim
)])
1734 while (newlim
< lim
&& !blanks
[to_uchar (*newlim
)])
1740 if (echar
!= 0) /* We need to skip over a portion of the end field. */
1742 /* If we're ignoring leading blanks when computing the End
1743 of the field, skip past them here. */
1744 if (key
->skipeblanks
)
1745 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1748 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1749 ptr
= MIN (lim
, ptr
+ echar
);
1755 /* Fill BUF reading from FP, moving buf->left bytes from the end
1756 of buf->buf to the beginning first. If EOF is reached and the
1757 file wasn't terminated by a newline, supply one. Set up BUF's line
1758 table too. FILE is the name of the file corresponding to FP.
1759 Return true if some input was read. */
1762 fillbuf (struct buffer
*buf
, FILE *fp
, char const *file
)
1764 struct keyfield
const *key
= keylist
;
1766 size_t line_bytes
= buf
->line_bytes
;
1767 size_t mergesize
= merge_buffer_size
- MIN_MERGE_BUFFER_SIZE
;
1772 if (buf
->used
!= buf
->left
)
1774 memmove (buf
->buf
, buf
->buf
+ buf
->used
- buf
->left
, buf
->left
);
1775 buf
->used
= buf
->left
;
1781 char *ptr
= buf
->buf
+ buf
->used
;
1782 struct line
*linelim
= buffer_linelim (buf
);
1783 struct line
*line
= linelim
- buf
->nlines
;
1784 size_t avail
= (char *) linelim
- buf
->nlines
* line_bytes
- ptr
;
1785 char *line_start
= buf
->nlines
? line
->text
+ line
->length
: buf
->buf
;
1787 while (line_bytes
+ 1 < avail
)
1789 /* Read as many bytes as possible, but do not read so many
1790 bytes that there might not be enough room for the
1791 corresponding line array. The worst case is when the
1792 rest of the input file consists entirely of newlines,
1793 except that the last byte is not a newline. */
1794 size_t readsize
= (avail
- 1) / (line_bytes
+ 1);
1795 size_t bytes_read
= fread (ptr
, 1, readsize
, fp
);
1796 char *ptrlim
= ptr
+ bytes_read
;
1798 avail
-= bytes_read
;
1800 if (bytes_read
!= readsize
)
1803 sort_die (_("read failed"), file
);
1807 if (buf
->buf
== ptrlim
)
1809 if (line_start
!= ptrlim
&& ptrlim
[-1] != eol
)
1814 /* Find and record each line in the just-read input. */
1815 while ((p
= memchr (ptr
, eol
, ptrlim
- ptr
)))
1817 /* Delimit the line with NUL. This eliminates the need to
1818 temporarily replace the last byte with NUL when calling
1819 xmemcoll, which increases performance. */
1823 line
->text
= line_start
;
1824 line
->length
= ptr
- line_start
;
1825 mergesize
= MAX (mergesize
, line
->length
);
1826 avail
-= line_bytes
;
1830 /* Precompute the position of the first key for
1832 line
->keylim
= (key
->eword
== SIZE_MAX
1834 : limfield (line
, key
));
1836 if (key
->sword
!= SIZE_MAX
)
1837 line
->keybeg
= begfield (line
, key
);
1840 if (key
->skipsblanks
)
1841 while (blanks
[to_uchar (*line_start
)])
1843 line
->keybeg
= line_start
;
1855 buf
->used
= ptr
- buf
->buf
;
1856 buf
->nlines
= buffer_linelim (buf
) - line
;
1857 if (buf
->nlines
!= 0)
1859 buf
->left
= ptr
- line_start
;
1860 merge_buffer_size
= mergesize
+ MIN_MERGE_BUFFER_SIZE
;
1865 /* The current input line is too long to fit in the buffer.
1866 Increase the buffer size and try again, keeping it properly
1868 size_t line_alloc
= buf
->alloc
/ sizeof (struct line
);
1869 buf
->buf
= x2nrealloc (buf
->buf
, &line_alloc
, sizeof (struct line
));
1870 buf
->alloc
= line_alloc
* sizeof (struct line
);
1875 /* Table that maps characters to order-of-magnitude values. */
1876 static char const unit_order
[UCHAR_LIM
] =
1878 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1879 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'R' == 82 && 'Q' == 81 \
1881 /* This initializer syntax works on all C99 hosts. For now, use
1882 it only on non-ASCII hosts, to ease the pain of porting to
1883 pre-C99 ASCII hosts. */
1884 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1888 /* Generate the following table with this command:
1889 perl -e 'my %a=(k=>1,K=>1,M=>2,G=>3,T=>4,P=>5,E=>6,Z=>7,Y=>8,R=>9,Q=>10);
1890 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1892 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1893 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1894 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1895 0, 0, 0, 1, 0, 2, 0, 0, 5, 10, 9, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1896 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1897 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1898 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1899 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1900 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1901 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1902 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1906 /* Traverse number given as *number consisting of digits, thousands_sep, and
1907 decimal_point chars only. Returns the highest digit found in the number,
1908 or '\0' if no digit has been found. Upon return *number points at the
1909 character that immediately follows after the given number. */
1911 traverse_raw_number (char const **number
)
1913 char const *p
= *number
;
1915 char max_digit
= '\0';
1916 bool ends_with_thousands_sep
= false;
1918 /* Scan to end of number.
1919 Decimals or separators not followed by digits stop the scan.
1920 Numbers ending in decimals or separators are thus considered
1921 to be lacking in units.
1922 FIXME: add support for multibyte thousands_sep and decimal_point. */
1924 while (ISDIGIT (ch
= *p
++))
1929 /* Allow to skip only one occurrence of thousands_sep to avoid finding
1930 the unit in the next column in case thousands_sep matches as blank
1931 and is used as column delimiter. */
1932 ends_with_thousands_sep
= (*p
== thousands_sep
);
1933 if (ends_with_thousands_sep
)
1937 if (ends_with_thousands_sep
)
1939 /* thousands_sep not followed by digit is not allowed. */
1944 if (ch
== decimal_point
)
1945 while (ISDIGIT (ch
= *p
++))
1953 /* Return an integer that represents the order of magnitude of the
1954 unit following the number. The number may contain thousands
1955 separators and a decimal point, but it may not contain leading blanks.
1956 Negative numbers get negative orders; zero numbers have a zero order. */
1960 find_unit_order (char const *number
)
1962 bool minus_sign
= (*number
== '-');
1963 char const *p
= number
+ minus_sign
;
1964 char max_digit
= traverse_raw_number (&p
);
1965 if ('0' < max_digit
)
1967 unsigned char ch
= *p
;
1968 int order
= unit_order
[ch
];
1969 return (minus_sign
? -order
: order
);
1975 /* Compare numbers A and B ending in units with SI or IEC prefixes
1976 <none/unknown> < K/k < M < G < T < P < E < Z < Y < R < Q */
1980 human_numcompare (char const *a
, char const *b
)
1982 while (blanks
[to_uchar (*a
)])
1984 while (blanks
[to_uchar (*b
)])
1987 int diff
= find_unit_order (a
) - find_unit_order (b
);
1988 return (diff
? diff
: strnumcmp (a
, b
, decimal_point
, thousands_sep
));
1991 /* Compare strings A and B as numbers without explicitly converting them to
1992 machine numbers. Comparatively slow for short strings, but asymptotically
1997 numcompare (char const *a
, char const *b
)
1999 while (blanks
[to_uchar (*a
)])
2001 while (blanks
[to_uchar (*b
)])
2004 return strnumcmp (a
, b
, decimal_point
, thousands_sep
);
2008 nan_compare (long double a
, long double b
)
2010 char buf
[2][sizeof "-nan""()" + CHAR_BIT
* sizeof a
];
2011 snprintf (buf
[0], sizeof buf
[0], "%Lf", a
);
2012 snprintf (buf
[1], sizeof buf
[1], "%Lf", b
);
2013 return strcmp (buf
[0], buf
[1]);
2017 general_numcompare (char const *sa
, char const *sb
)
2019 /* FIXME: maybe add option to try expensive FP conversion
2020 only if A and B can't be compared more cheaply/accurately. */
2024 long double a
= strtold (sa
, &ea
);
2025 long double b
= strtold (sb
, &eb
);
2027 /* Put conversion errors at the start of the collating sequence. */
2029 return sb
== eb
? 0 : -1;
2033 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
2034 conversion errors but before numbers; sort them by internal
2035 bit-pattern, for lack of a more portable alternative. */
2041 : nan_compare (a
, b
));
2044 /* Return an integer in 1..12 of the month name MONTH.
2045 Return 0 if the name in S is not recognized. */
2048 getmonth (char const *month
, char **ea
)
2051 size_t hi
= MONTHS_PER_YEAR
;
2053 while (blanks
[to_uchar (*month
)])
2058 size_t ix
= (lo
+ hi
) / 2;
2059 char const *m
= month
;
2060 char const *n
= monthtab
[ix
].name
;
2068 return monthtab
[ix
].val
;
2070 if (to_uchar (fold_toupper
[to_uchar (*m
)]) < to_uchar (*n
))
2075 else if (to_uchar (fold_toupper
[to_uchar (*m
)]) > to_uchar (*n
))
2087 /* When using the OpenSSL implementation, dynamically link only if -R.
2088 This saves startup time in the usual (sans -R) case. */
2090 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2091 /* In the typical case where md5.h does not #undef HAVE_OPENSSL_MD5,
2092 trick md5.h into declaring and using pointers to functions not functions.
2093 This causes the compiler's -lcrypto option to have no effect,
2094 as sort.o no longer uses any crypto symbols statically. */
2095 # define MD5_Init (*ptr_MD5_Init)
2096 # define MD5_Update (*ptr_MD5_Update)
2097 # define MD5_Final (*ptr_MD5_Final)
2102 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2105 /* Diagnose a dynamic linking failure. */
2109 error (SORT_FAILURE
, 0, "%s", dlerror ());
2112 /* Return a function pointer in HANDLE for SYMBOL. */
2114 symbol_address (void *handle
, char const *symbol
)
2116 void *address
= dlsym (handle
, symbol
);
2123 /* Dynamically link the crypto library, if it needs linking. */
2125 link_libcrypto (void)
2127 #if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
2128 void *handle
= dlopen (LIBCRYPTO_SONAME
, RTLD_LAZY
| RTLD_GLOBAL
);
2131 ptr_MD5_Init
= symbol_address (handle
, "MD5_Init");
2132 ptr_MD5_Update
= symbol_address (handle
, "MD5_Update");
2133 ptr_MD5_Final
= symbol_address (handle
, "MD5_Final");
2137 /* A randomly chosen MD5 state, used for random comparison. */
2138 static struct md5_ctx random_md5_state
;
2140 /* Initialize the randomly chosen MD5 state. */
2143 random_md5_state_init (char const *random_source
)
2145 unsigned char buf
[MD5_DIGEST_SIZE
];
2146 struct randread_source
*r
= randread_new (random_source
, sizeof buf
);
2148 sort_die (_("open failed"), random_source
? random_source
: "getrandom");
2149 randread (r
, buf
, sizeof buf
);
2150 if (randread_free (r
) != 0)
2151 sort_die (_("close failed"), random_source
);
2153 md5_init_ctx (&random_md5_state
);
2154 md5_process_bytes (buf
, sizeof buf
, &random_md5_state
);
2157 /* This is like strxfrm, except it reports any error and exits. */
2160 xstrxfrm (char *restrict dest
, char const *restrict src
, size_t destsize
)
2163 size_t translated_size
= strxfrm (dest
, src
, destsize
);
2167 error (0, errno
, _("string transformation failed"));
2168 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2169 error (SORT_FAILURE
, 0,
2170 _("the original string was %s"),
2171 quotearg_n_style (0, locale_quoting_style
, src
));
2174 return translated_size
;
2177 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2178 using one or more random hash functions. TEXTA[LENA] and
2179 TEXTB[LENB] must be zero. */
2182 compare_random (char *restrict texta
, size_t lena
,
2183 char *restrict textb
, size_t lenb
)
2185 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2186 data. This is used to break ties if there is a checksum
2187 collision, and this is good enough given the astronomically low
2188 probability of a collision. */
2191 char stackbuf
[4000];
2192 char *buf
= stackbuf
;
2193 size_t bufsize
= sizeof stackbuf
;
2194 void *allocated
= nullptr;
2195 uint32_t dig
[2][MD5_DIGEST_SIZE
/ sizeof (uint32_t)];
2196 struct md5_ctx s
[2];
2197 s
[0] = s
[1] = random_md5_state
;
2199 if (hard_LC_COLLATE
)
2201 char const *lima
= texta
+ lena
;
2202 char const *limb
= textb
+ lenb
;
2206 /* Transform the text into the basis of comparison, so that byte
2207 strings that would otherwise considered to be equal are
2208 considered equal here even if their bytes differ.
2210 Each time through this loop, transform one
2211 null-terminated string's worth from TEXTA or from TEXTB
2212 or both. That way, there's no need to store the
2213 transformation of the whole line, if it contains many
2214 null-terminated strings. */
2216 /* Store the transformed data into a big-enough buffer. */
2218 /* A 3X size guess avoids the overhead of calling strxfrm
2219 twice on typical implementations. Don't worry about
2220 size_t overflow, as the guess need not be correct. */
2221 size_t guess_bufsize
= 3 * (lena
+ lenb
) + 2;
2222 if (bufsize
< guess_bufsize
)
2224 bufsize
= MAX (guess_bufsize
, bufsize
* 3 / 2);
2226 buf
= allocated
= malloc (bufsize
);
2230 bufsize
= sizeof stackbuf
;
2235 (texta
< lima
? xstrxfrm (buf
, texta
, bufsize
) + 1 : 0);
2236 bool a_fits
= sizea
<= bufsize
;
2239 ? (xstrxfrm ((a_fits
? buf
+ sizea
: nullptr), textb
,
2240 (a_fits
? bufsize
- sizea
: 0))
2244 if (! (a_fits
&& sizea
+ sizeb
<= bufsize
))
2246 bufsize
= sizea
+ sizeb
;
2247 if (bufsize
< SIZE_MAX
/ 3)
2248 bufsize
= bufsize
* 3 / 2;
2250 buf
= allocated
= xmalloc (bufsize
);
2252 strxfrm (buf
, texta
, sizea
);
2254 strxfrm (buf
+ sizea
, textb
, sizeb
);
2257 /* Advance past NULs to the next part of each input string,
2258 exiting the loop if both strings are exhausted. When
2259 exiting the loop, prepare to finish off the tiebreaker
2260 comparison properly. */
2262 texta
+= strlen (texta
) + 1;
2264 textb
+= strlen (textb
) + 1;
2265 if (! (texta
< lima
|| textb
< limb
))
2267 lena
= sizea
; texta
= buf
;
2268 lenb
= sizeb
; textb
= buf
+ sizea
;
2272 /* Accumulate the transformed data in the corresponding
2274 md5_process_bytes (buf
, sizea
, &s
[0]);
2275 md5_process_bytes (buf
+ sizea
, sizeb
, &s
[1]);
2277 /* Update the tiebreaker comparison of the transformed data. */
2280 xfrm_diff
= memcmp (buf
, buf
+ sizea
, MIN (sizea
, sizeb
));
2282 xfrm_diff
= (sizea
> sizeb
) - (sizea
< sizeb
);
2287 /* Compute and compare the checksums. */
2288 md5_process_bytes (texta
, lena
, &s
[0]); md5_finish_ctx (&s
[0], dig
[0]);
2289 md5_process_bytes (textb
, lenb
, &s
[1]); md5_finish_ctx (&s
[1], dig
[1]);
2290 int diff
= memcmp (dig
[0], dig
[1], sizeof dig
[0]);
2292 /* Fall back on the tiebreaker if the checksums collide. */
2297 xfrm_diff
= memcmp (texta
, textb
, MIN (lena
, lenb
));
2299 xfrm_diff
= (lena
> lenb
) - (lena
< lenb
);
2310 /* Return the printable width of the block of memory starting at
2311 TEXT and ending just before LIM, counting each tab as one byte.
2312 FIXME: Should we generally be counting non printable chars? */
2315 debug_width (char const *text
, char const *lim
)
2317 size_t width
= mbsnwidth (text
, lim
- text
, 0);
2319 width
+= (*text
++ == '\t');
2323 /* For debug mode, "underline" a key at the
2324 specified offset and screen width. */
2327 mark_key (size_t offset
, size_t width
)
2333 printf (_("^ no match for key\n"));
2344 /* Return true if KEY is a numeric key. */
2347 key_numeric (struct keyfield
const *key
)
2349 return key
->numeric
|| key
->general_numeric
|| key
->human_numeric
;
2352 /* For LINE, output a debugging line that underlines KEY in LINE.
2353 If KEY is null, underline the whole line. */
2356 debug_key (struct line
const *line
, struct keyfield
const *key
)
2358 char *text
= line
->text
;
2360 char *lim
= text
+ line
->length
- 1;
2364 if (key
->sword
!= SIZE_MAX
)
2365 beg
= begfield (line
, key
);
2366 if (key
->eword
!= SIZE_MAX
)
2367 lim
= limfield (line
, key
);
2369 if ((key
->skipsblanks
&& key
->sword
== SIZE_MAX
)
2370 || key
->month
|| key_numeric (key
))
2375 while (blanks
[to_uchar (*beg
)])
2378 char *tighter_lim
= beg
;
2382 else if (key
->month
)
2383 getmonth (beg
, &tighter_lim
);
2384 else if (key
->general_numeric
)
2385 ignore_value (strtold (beg
, &tighter_lim
));
2386 else if (key
->numeric
|| key
->human_numeric
)
2388 char const *p
= beg
+ (beg
< lim
&& *beg
== '-');
2389 char max_digit
= traverse_raw_number (&p
);
2390 if ('0' <= max_digit
)
2392 unsigned char ch
= *p
;
2393 tighter_lim
= (char *) p
2394 + (key
->human_numeric
&& unit_order
[ch
]);
2405 size_t offset
= debug_width (text
, beg
);
2406 size_t width
= debug_width (beg
, lim
);
2407 mark_key (offset
, width
);
2410 /* Debug LINE by underlining its keys. */
2413 debug_line (struct line
const *line
)
2415 struct keyfield
const *key
= keylist
;
2418 debug_key (line
, key
);
2419 while (key
&& ((key
= key
->next
) || ! (unique
|| stable
)));
2422 /* Return whether sorting options specified for key. */
2425 default_key_compare (struct keyfield
const *key
)
2427 return ! (key
->ignore
2431 || key_numeric (key
)
2435 /* || key->reverse */
2439 /* Convert a key to the short options used to specify it. */
2442 key_to_opts (struct keyfield
const *key
, char *opts
)
2444 if (key
->skipsblanks
|| key
->skipeblanks
)
2445 *opts
++ = 'b';/* either disables global -b */
2446 if (key
->ignore
== nondictionary
)
2450 if (key
->general_numeric
)
2452 if (key
->human_numeric
)
2454 if (key
->ignore
== nonprinting
)
2469 /* Output data independent key warnings to stderr. */
2472 key_warnings (struct keyfield
const *gkey
, bool gkey_only
)
2474 struct keyfield
const *key
;
2475 struct keyfield ugkey
= *gkey
;
2476 unsigned long keynum
= 1;
2477 bool basic_numeric_field
= false;
2478 bool general_numeric_field
= false;
2479 bool basic_numeric_field_span
= false;
2480 bool general_numeric_field_span
= false;
2482 for (key
= keylist
; key
; key
= key
->next
, keynum
++)
2484 if (key_numeric (key
))
2486 if (key
->general_numeric
)
2487 general_numeric_field
= true;
2489 basic_numeric_field
= true;
2492 if (key
->traditional_used
)
2494 size_t sword
= key
->sword
;
2495 size_t eword
= key
->eword
;
2496 char tmp
[INT_BUFSIZE_BOUND (uintmax_t)];
2497 /* obsolescent syntax +A.x -B.y is equivalent to:
2498 -k A+1.x+1,B.y (when y = 0)
2499 -k A+1.x+1,B+1.y (when y > 0) */
2500 char obuf
[INT_BUFSIZE_BOUND (sword
) * 2 + 4]; /* +# -# */
2501 char nbuf
[INT_BUFSIZE_BOUND (sword
) * 2 + 5]; /* -k #,# */
2505 if (sword
== SIZE_MAX
)
2508 po
= stpcpy (stpcpy (po
, "+"), umaxtostr (sword
, tmp
));
2509 pn
= stpcpy (stpcpy (pn
, "-k "), umaxtostr (sword
+ 1, tmp
));
2510 if (key
->eword
!= SIZE_MAX
)
2512 stpcpy (stpcpy (po
, " -"), umaxtostr (eword
+ 1, tmp
));
2513 stpcpy (stpcpy (pn
, ","),
2514 umaxtostr (eword
+ 1
2515 + (key
->echar
== SIZE_MAX
), tmp
));
2517 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2518 quote_n (0, obuf
), quote_n (1, nbuf
));
2521 /* Warn about field specs that will never match. */
2522 bool zero_width
= key
->sword
!= SIZE_MAX
&& key
->eword
< key
->sword
;
2524 error (0, 0, _("key %lu has zero width and will be ignored"), keynum
);
2526 /* Warn about significant leading blanks. */
2527 bool implicit_skip
= key_numeric (key
) || key
->month
;
2528 bool line_offset
= key
->eword
== 0 && key
->echar
!= 0; /* -k1.x,1.y */
2529 if (!zero_width
&& !gkey_only
&& tab
== TAB_DEFAULT
&& !line_offset
2530 && ((!key
->skipsblanks
&& !implicit_skip
)
2531 || (!key
->skipsblanks
&& key
->schar
)
2532 || (!key
->skipeblanks
&& key
->echar
)))
2533 error (0, 0, _("leading blanks are significant in key %lu; "
2534 "consider also specifying 'b'"), keynum
);
2536 /* Warn about numeric comparisons spanning fields,
2537 as field delimiters could be interpreted as part
2538 of the number (maybe only in other locales). */
2539 if (!gkey_only
&& key_numeric (key
))
2541 size_t sword
= key
->sword
+ 1;
2542 size_t eword
= key
->eword
+ 1;
2545 if (!eword
|| sword
< eword
)
2547 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2549 if (key
->general_numeric
)
2550 general_numeric_field_span
= true;
2552 basic_numeric_field_span
= true;
2556 /* Flag global options not copied or specified in any key. */
2557 if (ugkey
.ignore
&& (ugkey
.ignore
== key
->ignore
))
2558 ugkey
.ignore
= nullptr;
2559 if (ugkey
.translate
&& (ugkey
.translate
== key
->translate
))
2560 ugkey
.translate
= nullptr;
2561 ugkey
.skipsblanks
&= !key
->skipsblanks
;
2562 ugkey
.skipeblanks
&= !key
->skipeblanks
;
2563 ugkey
.month
&= !key
->month
;
2564 ugkey
.numeric
&= !key
->numeric
;
2565 ugkey
.general_numeric
&= !key
->general_numeric
;
2566 ugkey
.human_numeric
&= !key
->human_numeric
;
2567 ugkey
.random
&= !key
->random
;
2568 ugkey
.version
&= !key
->version
;
2569 ugkey
.reverse
&= !key
->reverse
;
2572 /* Explicitly warn if field delimiters in this locale
2573 don't constrain numbers. */
2574 bool number_locale_warned
= false;
2575 if (basic_numeric_field_span
)
2577 if (tab
== TAB_DEFAULT
2578 ? thousands_sep
!= NON_CHAR
&& (isblank (to_uchar (thousands_sep
)))
2579 : tab
== thousands_sep
)
2582 _("field separator %s is treated as a "
2583 "group separator in numbers"),
2584 quote (((char []) {thousands_sep
, 0})));
2585 number_locale_warned
= true;
2588 if (basic_numeric_field_span
|| general_numeric_field_span
)
2590 if (tab
== TAB_DEFAULT
2591 ? thousands_sep
!= NON_CHAR
&& (isblank (to_uchar (decimal_point
)))
2592 : tab
== decimal_point
)
2595 _("field separator %s is treated as a "
2596 "decimal point in numbers"),
2597 quote (((char []) {decimal_point
, 0})));
2598 number_locale_warned
= true;
2600 else if (tab
== '-')
2603 _("field separator %s is treated as a "
2604 "minus sign in numbers"),
2605 quote (((char []) {tab
, 0})));
2607 else if (general_numeric_field_span
&& tab
== '+')
2610 _("field separator %s is treated as a "
2611 "plus sign in numbers"),
2612 quote (((char []) {tab
, 0})));
2616 /* Explicitly indicate the decimal point used in this locale,
2617 as it suggests that robust scripts need to consider
2618 setting the locale when comparing numbers. */
2619 if ((basic_numeric_field
|| general_numeric_field
) && ! number_locale_warned
)
2622 _("%snumbers use %s as a decimal point in this locale"),
2623 tab
== decimal_point
? "" : _("note "),
2624 quote (((char []) {decimal_point
, 0})));
2628 if (basic_numeric_field
&& thousands_sep_ignored
)
2631 _("the multi-byte number group separator "
2632 "in this locale is not supported"));
2635 /* Warn about ignored global options flagged above.
2636 This clears all flags if UGKEY is the only one in the list. */
2637 if (!default_key_compare (&ugkey
)
2638 || (ugkey
.reverse
&& (stable
|| unique
) && keylist
))
2640 bool ugkey_reverse
= ugkey
.reverse
;
2641 if (!(stable
|| unique
))
2642 ugkey
.reverse
= false;
2643 /* The following is too big, but guaranteed to be "big enough". */
2644 char opts
[sizeof short_options
];
2645 key_to_opts (&ugkey
, opts
);
2647 ngettext ("option '-%s' is ignored",
2648 "options '-%s' are ignored",
2649 select_plural (strlen (opts
))), opts
);
2650 ugkey
.reverse
= ugkey_reverse
;
2652 if (ugkey
.reverse
&& !(stable
|| unique
) && keylist
)
2653 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2656 /* Return either the sense of DIFF or its reverse, depending on REVERSED.
2657 If REVERSED, do not simply negate DIFF as that can mishandle INT_MIN. */
2660 diff_reversed (int diff
, bool reversed
)
2662 return reversed
? (diff
< 0) - (diff
> 0) : diff
;
2665 /* Compare two lines A and B trying every key in sequence until there
2666 are no more keys or a difference is found. */
2669 keycompare (struct line
const *a
, struct line
const *b
)
2671 struct keyfield
*key
= keylist
;
2673 /* For the first iteration only, the key positions have been
2674 precomputed for us. */
2675 char *texta
= a
->keybeg
;
2676 char *textb
= b
->keybeg
;
2677 char *lima
= a
->keylim
;
2678 char *limb
= b
->keylim
;
2684 char const *translate
= key
->translate
;
2685 bool const *ignore
= key
->ignore
;
2687 /* Treat field ends before field starts as empty fields. */
2688 lima
= MAX (texta
, lima
);
2689 limb
= MAX (textb
, limb
);
2691 /* Find the lengths. */
2692 size_t lena
= lima
- texta
;
2693 size_t lenb
= limb
- textb
;
2695 if (hard_LC_COLLATE
|| key_numeric (key
)
2696 || key
->month
|| key
->random
|| key
->version
)
2698 /* Ordinarily use the keys in-place, temporarily null-terminated. */
2701 size_t tlena
= lena
;
2702 size_t tlenb
= lenb
;
2703 char enda
= ta
[tlena
];
2704 char endb
= tb
[tlenb
];
2706 void *allocated
= nullptr;
2707 char stackbuf
[4000];
2709 if (ignore
|| translate
)
2711 /* Compute with copies of the keys, which are the result of
2712 translating or ignoring characters, and which need their
2717 /* Allocate space for copies. */
2718 size_t size
= lena
+ 1 + lenb
+ 1;
2719 if (size
<= sizeof stackbuf
)
2722 ta
= allocated
= xmalloc (size
);
2725 /* Put into each copy a version of the key in which the
2726 requested characters are ignored or translated. */
2727 for (tlena
= i
= 0; i
< lena
; i
++)
2728 if (! (ignore
&& ignore
[to_uchar (texta
[i
])]))
2729 ta
[tlena
++] = (translate
2730 ? translate
[to_uchar (texta
[i
])]
2733 for (tlenb
= i
= 0; i
< lenb
; i
++)
2734 if (! (ignore
&& ignore
[to_uchar (textb
[i
])]))
2735 tb
[tlenb
++] = (translate
2736 ? translate
[to_uchar (textb
[i
])]
2744 diff
= numcompare (ta
, tb
);
2745 else if (key
->general_numeric
)
2746 diff
= general_numcompare (ta
, tb
);
2747 else if (key
->human_numeric
)
2748 diff
= human_numcompare (ta
, tb
);
2749 else if (key
->month
)
2750 diff
= getmonth (ta
, nullptr) - getmonth (tb
, nullptr);
2751 else if (key
->random
)
2752 diff
= compare_random (ta
, tlena
, tb
, tlenb
);
2753 else if (key
->version
)
2754 diff
= filenvercmp (ta
, tlena
, tb
, tlenb
);
2757 /* Locale-dependent string sorting. This is slower than
2758 C-locale sorting, which is implemented below. */
2760 diff
= - NONZERO (tlenb
);
2761 else if (tlenb
== 0)
2764 diff
= xmemcoll0 (ta
, tlena
+ 1, tb
, tlenb
+ 1);
2774 #define CMP_WITH_IGNORE(A, B) \
2779 while (texta < lima && ignore[to_uchar (*texta)]) \
2781 while (textb < limb && ignore[to_uchar (*textb)]) \
2783 if (! (texta < lima && textb < limb)) \
2785 diff = (texta < lima) - (textb < limb); \
2788 diff = to_uchar (A) - to_uchar (B); \
2799 CMP_WITH_IGNORE (translate
[to_uchar (*texta
)],
2800 translate
[to_uchar (*textb
)]);
2802 CMP_WITH_IGNORE (*texta
, *textb
);
2806 size_t lenmin
= MIN (lena
, lenb
);
2814 diff
= (to_uchar (translate
[to_uchar (texta
[i
])])
2815 - to_uchar (translate
[to_uchar (textb
[i
])]));
2823 diff
= memcmp (texta
, textb
, lenmin
);
2826 diff
= (lena
> lenb
) - (lena
< lenb
);
2836 /* Find the beginning and limit of the next field. */
2837 if (key
->eword
!= SIZE_MAX
)
2838 lima
= limfield (a
, key
), limb
= limfield (b
, key
);
2840 lima
= a
->text
+ a
->length
- 1, limb
= b
->text
+ b
->length
- 1;
2842 if (key
->sword
!= SIZE_MAX
)
2843 texta
= begfield (a
, key
), textb
= begfield (b
, key
);
2846 texta
= a
->text
, textb
= b
->text
;
2847 if (key
->skipsblanks
)
2849 while (texta
< lima
&& blanks
[to_uchar (*texta
)])
2851 while (textb
< limb
&& blanks
[to_uchar (*textb
)])
2857 return diff_reversed (diff
, key
->reverse
);
2860 /* Compare two lines A and B, returning negative, zero, or positive
2861 depending on whether A compares less than, equal to, or greater than B. */
2864 compare (struct line
const *a
, struct line
const *b
)
2869 /* First try to compare on the specified keys (if any).
2870 The only two cases with no key at all are unadorned sort,
2871 and unadorned sort -r. */
2874 diff
= keycompare (a
, b
);
2875 if (diff
|| unique
|| stable
)
2879 /* If the keys all compare equal (or no keys were specified)
2880 fall through to the default comparison. */
2881 alen
= a
->length
- 1, blen
= b
->length
- 1;
2884 diff
= - NONZERO (blen
);
2887 else if (hard_LC_COLLATE
)
2889 /* xmemcoll0 is a performance enhancement as
2890 it will not unconditionally write '\0' after the
2891 passed in buffers, which was seen to give around
2892 a 3% increase in performance for short lines. */
2893 diff
= xmemcoll0 (a
->text
, alen
+ 1, b
->text
, blen
+ 1);
2897 diff
= memcmp (a
->text
, b
->text
, MIN (alen
, blen
));
2899 diff
= (alen
> blen
) - (alen
< blen
);
2902 return diff_reversed (diff
, reverse
);
2905 /* Write LINE to output stream FP; the output file's name is
2906 OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
2907 otherwise. If debugging is enabled and FP is standard output,
2908 append some debugging information. */
2911 write_line (struct line
const *line
, FILE *fp
, char const *output_file
)
2913 char *buf
= line
->text
;
2914 size_t n_bytes
= line
->length
;
2915 char *ebuf
= buf
+ n_bytes
;
2917 if (!output_file
&& debug
)
2919 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2920 char const *c
= buf
;
2929 if (fputc (wc
, fp
) == EOF
)
2930 sort_die (_("write failed"), output_file
);
2938 if (fwrite (buf
, 1, n_bytes
, fp
) != n_bytes
)
2939 sort_die (_("write failed"), output_file
);
2944 /* Check that the lines read from FILE_NAME come in order. Return
2945 true if they are in order. If CHECKONLY == 'c', also print a
2946 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2947 they are not in order. */
2950 check (char const *file_name
, char checkonly
)
2952 FILE *fp
= xfopen (file_name
, "r");
2953 struct buffer buf
; /* Input buffer. */
2954 struct line temp
; /* Copy of previous line. */
2956 uintmax_t line_number
= 0;
2957 struct keyfield
const *key
= keylist
;
2958 bool nonunique
= ! unique
;
2959 bool ordered
= true;
2961 initbuf (&buf
, sizeof (struct line
),
2962 MAX (merge_buffer_size
, sort_size
));
2963 temp
.text
= nullptr;
2965 while (fillbuf (&buf
, fp
, file_name
))
2967 struct line
const *line
= buffer_linelim (&buf
);
2968 struct line
const *linebase
= line
- buf
.nlines
;
2970 /* Make sure the line saved from the old buffer contents is
2971 less than or equal to the first line of the new buffer. */
2972 if (alloc
&& nonunique
<= compare (&temp
, line
- 1))
2976 if (checkonly
== 'c')
2978 struct line
const *disorder_line
= line
- 1;
2979 uintmax_t disorder_line_number
=
2980 buffer_linelim (&buf
) - disorder_line
+ line_number
;
2981 char hr_buf
[INT_BUFSIZE_BOUND (disorder_line_number
)];
2982 fprintf (stderr
, _("%s: %s:%s: disorder: "),
2983 program_name
, file_name
,
2984 umaxtostr (disorder_line_number
, hr_buf
));
2985 write_line (disorder_line
, stderr
, _("standard error"));
2993 /* Compare each line in the buffer with its successor. */
2994 while (linebase
< --line
)
2995 if (nonunique
<= compare (line
, line
- 1))
2996 goto found_disorder
;
2998 line_number
+= buf
.nlines
;
3000 /* Save the last line of the buffer. */
3001 if (alloc
< line
->length
)
3008 alloc
= line
->length
;
3012 while (alloc
< line
->length
);
3015 temp
.text
= xmalloc (alloc
);
3017 memcpy (temp
.text
, line
->text
, line
->length
);
3018 temp
.length
= line
->length
;
3021 temp
.keybeg
= temp
.text
+ (line
->keybeg
- line
->text
);
3022 temp
.keylim
= temp
.text
+ (line
->keylim
- line
->text
);
3026 xfclose (fp
, file_name
);
3032 /* Open FILES (there are NFILES of them) and store the resulting array
3033 of stream pointers into (*PFPS). Allocate the array. Return the
3034 number of successfully opened files, setting errno if this value is
3035 less than NFILES. */
3038 open_input_files (struct sortfile
*files
, size_t nfiles
, FILE ***pfps
)
3040 FILE **fps
= *pfps
= xnmalloc (nfiles
, sizeof *fps
);
3043 /* Open as many input files as we can. */
3044 for (i
= 0; i
< nfiles
; i
++)
3046 fps
[i
] = (files
[i
].temp
&& files
[i
].temp
->state
!= UNCOMPRESSED
3047 ? open_temp (files
[i
].temp
)
3048 : stream_open (files
[i
].name
, "r"));
3056 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3057 files (all of which are at the start of the FILES array), and
3058 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3059 FPS is the vector of open stream corresponding to the files.
3060 Close input and output streams before returning.
3061 OUTPUT_FILE gives the name of the output file. If it is null,
3062 the output file is standard output. */
3065 mergefps (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
3066 FILE *ofp
, char const *output_file
, FILE **fps
)
3068 struct buffer
*buffer
= xnmalloc (nfiles
, sizeof *buffer
);
3069 /* Input buffers for each file. */
3070 struct line saved
; /* Saved line storage for unique check. */
3071 struct line
const *savedline
= nullptr;
3072 /* &saved if there is a saved line. */
3073 size_t savealloc
= 0; /* Size allocated for the saved line. */
3074 struct line
const **cur
= xnmalloc (nfiles
, sizeof *cur
);
3075 /* Current line in each line table. */
3076 struct line
const **base
= xnmalloc (nfiles
, sizeof *base
);
3077 /* Base of each line table. */
3078 size_t *ord
= xnmalloc (nfiles
, sizeof *ord
);
3079 /* Table representing a permutation of fps,
3080 such that cur[ord[0]] is the smallest line
3081 and will be next output. */
3085 struct keyfield
const *key
= keylist
;
3086 saved
.text
= nullptr;
3088 /* Read initial lines from each input file. */
3089 for (i
= 0; i
< nfiles
; )
3091 initbuf (&buffer
[i
], sizeof (struct line
),
3092 MAX (merge_buffer_size
, sort_size
/ nfiles
));
3093 if (fillbuf (&buffer
[i
], fps
[i
], files
[i
].name
))
3095 struct line
const *linelim
= buffer_linelim (&buffer
[i
]);
3096 cur
[i
] = linelim
- 1;
3097 base
[i
] = linelim
- buffer
[i
].nlines
;
3102 /* fps[i] is empty; eliminate it from future consideration. */
3103 xfclose (fps
[i
], files
[i
].name
);
3107 zaptemp (files
[i
].name
);
3109 free (buffer
[i
].buf
);
3111 for (j
= i
; j
< nfiles
; ++j
)
3113 files
[j
] = files
[j
+ 1];
3114 fps
[j
] = fps
[j
+ 1];
3119 /* Set up the ord table according to comparisons among input lines.
3120 Since this only reorders two items if one is strictly greater than
3121 the other, it is stable. */
3122 for (i
= 0; i
< nfiles
; ++i
)
3124 for (i
= 1; i
< nfiles
; ++i
)
3125 if (0 < compare (cur
[ord
[i
- 1]], cur
[ord
[i
]]))
3126 t
= ord
[i
- 1], ord
[i
- 1] = ord
[i
], ord
[i
] = t
, i
= 0;
3128 /* Repeatedly output the smallest line until no input remains. */
3131 struct line
const *smallest
= cur
[ord
[0]];
3133 /* If uniquified output is turned on, output only the first of
3134 an identical series of lines. */
3137 if (savedline
&& compare (savedline
, smallest
))
3139 savedline
= nullptr;
3140 write_line (&saved
, ofp
, output_file
);
3145 if (savealloc
< smallest
->length
)
3150 savealloc
= smallest
->length
;
3153 while ((savealloc
*= 2) < smallest
->length
);
3156 saved
.text
= xmalloc (savealloc
);
3158 saved
.length
= smallest
->length
;
3159 memcpy (saved
.text
, smallest
->text
, saved
.length
);
3163 saved
.text
+ (smallest
->keybeg
- smallest
->text
);
3165 saved
.text
+ (smallest
->keylim
- smallest
->text
);
3170 write_line (smallest
, ofp
, output_file
);
3172 /* Check if we need to read more lines into memory. */
3173 if (base
[ord
[0]] < smallest
)
3174 cur
[ord
[0]] = smallest
- 1;
3177 if (fillbuf (&buffer
[ord
[0]], fps
[ord
[0]], files
[ord
[0]].name
))
3179 struct line
const *linelim
= buffer_linelim (&buffer
[ord
[0]]);
3180 cur
[ord
[0]] = linelim
- 1;
3181 base
[ord
[0]] = linelim
- buffer
[ord
[0]].nlines
;
3185 /* We reached EOF on fps[ord[0]]. */
3186 for (i
= 1; i
< nfiles
; ++i
)
3187 if (ord
[i
] > ord
[0])
3190 xfclose (fps
[ord
[0]], files
[ord
[0]].name
);
3191 if (ord
[0] < ntemps
)
3194 zaptemp (files
[ord
[0]].name
);
3196 free (buffer
[ord
[0]].buf
);
3197 for (i
= ord
[0]; i
< nfiles
; ++i
)
3199 fps
[i
] = fps
[i
+ 1];
3200 files
[i
] = files
[i
+ 1];
3201 buffer
[i
] = buffer
[i
+ 1];
3202 cur
[i
] = cur
[i
+ 1];
3203 base
[i
] = base
[i
+ 1];
3205 for (i
= 0; i
< nfiles
; ++i
)
3206 ord
[i
] = ord
[i
+ 1];
3211 /* The new line just read in may be larger than other lines
3212 already in main memory; push it back in the queue until we
3213 encounter a line larger than it. Optimize for the common
3214 case where the new line is smallest. */
3219 size_t ord0
= ord
[0];
3220 size_t count_of_smaller_lines
;
3224 int cmp
= compare (cur
[ord0
], cur
[ord
[probe
]]);
3225 if (cmp
< 0 || (cmp
== 0 && ord0
< ord
[probe
]))
3229 probe
= (lo
+ hi
) / 2;
3232 count_of_smaller_lines
= lo
- 1;
3233 for (j
= 0; j
< count_of_smaller_lines
; j
++)
3234 ord
[j
] = ord
[j
+ 1];
3235 ord
[count_of_smaller_lines
] = ord0
;
3239 if (unique
&& savedline
)
3241 write_line (&saved
, ofp
, output_file
);
3245 xfclose (ofp
, output_file
);
3253 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3254 files (all of which are at the start of the FILES array), and
3255 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3256 Close input and output files before returning.
3257 OUTPUT_FILE gives the name of the output file.
3259 Return the number of files successfully merged. This number can be
3260 less than NFILES if we ran low on file descriptors, but in this
3261 case it is never less than 2. */
3264 mergefiles (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
3265 FILE *ofp
, char const *output_file
)
3268 size_t nopened
= open_input_files (files
, nfiles
, &fps
);
3269 if (nopened
< nfiles
&& nopened
< 2)
3270 sort_die (_("open failed"), files
[nopened
].name
);
3271 mergefps (files
, ntemps
, nopened
, ofp
, output_file
, fps
);
3275 /* Merge into T (of size NLINES) the two sorted arrays of lines
3276 LO (with NLINES / 2 members), and
3277 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3278 T and LO point just past their respective arrays, and the arrays
3279 are in reverse order. NLINES must be at least 2. */
3282 mergelines (struct line
*restrict t
, size_t nlines
,
3283 struct line
const *restrict lo
)
3285 size_t nlo
= nlines
/ 2;
3286 size_t nhi
= nlines
- nlo
;
3287 struct line
*hi
= t
- nlo
;
3290 if (compare (lo
- 1, hi
- 1) <= 0)
3295 /* HI must equal T now, and there is no need to copy from
3314 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3315 Do this all within one thread. NLINES must be at least 2.
3316 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3317 Otherwise the sort is in-place and TEMP is half-sized.
3318 The input and output arrays are in reverse order, and LINES and
3319 TEMP point just past the end of their respective arrays.
3321 Use a recursive divide-and-conquer algorithm, in the style
3322 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3323 the optimization suggested by exercise 5.2.4-10; this requires room
3324 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3325 writes that this memory optimization was originally published by
3326 D. A. Bell, Comp J. 1 (1958), 75. */
3329 sequential_sort (struct line
*restrict lines
, size_t nlines
,
3330 struct line
*restrict temp
, bool to_temp
)
3334 /* Declare 'swap' as int, not bool, to work around a bug
3335 <https://lists.gnu.org/r/bug-coreutils/2005-10/msg00086.html>
3336 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3337 int swap
= (0 < compare (&lines
[-1], &lines
[-2]));
3340 temp
[-1] = lines
[-1 - swap
];
3341 temp
[-2] = lines
[-2 + swap
];
3345 temp
[-1] = lines
[-1];
3346 lines
[-1] = lines
[-2];
3347 lines
[-2] = temp
[-1];
3352 size_t nlo
= nlines
/ 2;
3353 size_t nhi
= nlines
- nlo
;
3354 struct line
*lo
= lines
;
3355 struct line
*hi
= lines
- nlo
;
3357 sequential_sort (hi
, nhi
, temp
- (to_temp
? nlo
: 0), to_temp
);
3359 sequential_sort (lo
, nlo
, temp
, !to_temp
);
3364 struct line
const *sorted_lo
;
3375 mergelines (dest
, nlines
, sorted_lo
);
3379 static struct merge_node
*init_node (struct merge_node
*restrict
,
3380 struct merge_node
*restrict
,
3381 struct line
*, size_t, size_t, bool);
3384 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3385 lines, with destination DEST. */
3386 static struct merge_node
*
3387 merge_tree_init (size_t nthreads
, size_t nlines
, struct line
*dest
)
3389 struct merge_node
*merge_tree
= xmalloc (2 * sizeof *merge_tree
* nthreads
);
3391 struct merge_node
*root
= merge_tree
;
3392 root
->lo
= root
->hi
= root
->end_lo
= root
->end_hi
= nullptr;
3393 root
->dest
= nullptr;
3394 root
->nlo
= root
->nhi
= nlines
;
3395 root
->parent
= nullptr;
3396 root
->level
= MERGE_END
;
3397 root
->queued
= false;
3398 pthread_mutex_init (&root
->lock
, nullptr);
3400 init_node (root
, root
+ 1, dest
, nthreads
, nlines
, false);
3404 /* Destroy the merge tree. */
3406 merge_tree_destroy (size_t nthreads
, struct merge_node
*merge_tree
)
3408 size_t n_nodes
= nthreads
* 2;
3409 struct merge_node
*node
= merge_tree
;
3413 pthread_mutex_destroy (&node
->lock
);
3420 /* Initialize a merge tree node and its descendants. The node's
3421 parent is PARENT. The node and its descendants are taken from the
3422 array of nodes NODE_POOL. Their destination starts at DEST; they
3423 will consume NTHREADS threads. The total number of sort lines is
3424 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3427 static struct merge_node
*
3428 init_node (struct merge_node
*restrict parent
,
3429 struct merge_node
*restrict node_pool
,
3430 struct line
*dest
, size_t nthreads
,
3431 size_t total_lines
, bool is_lo_child
)
3433 size_t nlines
= (is_lo_child
? parent
->nlo
: parent
->nhi
);
3434 size_t nlo
= nlines
/ 2;
3435 size_t nhi
= nlines
- nlo
;
3436 struct line
*lo
= dest
- total_lines
;
3437 struct line
*hi
= lo
- nlo
;
3438 struct line
**parent_end
= (is_lo_child
? &parent
->end_lo
: &parent
->end_hi
);
3440 struct merge_node
*node
= node_pool
++;
3441 node
->lo
= node
->end_lo
= lo
;
3442 node
->hi
= node
->end_hi
= hi
;
3443 node
->dest
= parent_end
;
3446 node
->parent
= parent
;
3447 node
->level
= parent
->level
+ 1;
3448 node
->queued
= false;
3449 pthread_mutex_init (&node
->lock
, nullptr);
3453 size_t lo_threads
= nthreads
/ 2;
3454 size_t hi_threads
= nthreads
- lo_threads
;
3455 node
->lo_child
= node_pool
;
3456 node_pool
= init_node (node
, node_pool
, lo
, lo_threads
,
3458 node
->hi_child
= node_pool
;
3459 node_pool
= init_node (node
, node_pool
, hi
, hi_threads
,
3460 total_lines
, false);
3464 node
->lo_child
= nullptr;
3465 node
->hi_child
= nullptr;
3471 /* Compare two merge nodes A and B for priority. */
3474 compare_nodes (void const *a
, void const *b
)
3476 struct merge_node
const *nodea
= a
;
3477 struct merge_node
const *nodeb
= b
;
3478 if (nodea
->level
== nodeb
->level
)
3479 return (nodea
->nlo
+ nodea
->nhi
) < (nodeb
->nlo
+ nodeb
->nhi
);
3480 return nodea
->level
< nodeb
->level
;
3483 /* Lock a merge tree NODE. */
3486 lock_node (struct merge_node
*node
)
3488 pthread_mutex_lock (&node
->lock
);
3491 /* Unlock a merge tree NODE. */
3494 unlock_node (struct merge_node
*node
)
3496 pthread_mutex_unlock (&node
->lock
);
3499 /* Destroy merge QUEUE. */
3502 queue_destroy (struct merge_node_queue
*queue
)
3504 heap_free (queue
->priority_queue
);
3505 pthread_cond_destroy (&queue
->cond
);
3506 pthread_mutex_destroy (&queue
->mutex
);
3509 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3510 NTHREADS threads. */
3513 queue_init (struct merge_node_queue
*queue
, size_t nthreads
)
3515 /* Though it's highly unlikely all nodes are in the heap at the same
3516 time, the heap should accommodate all of them. Counting a null
3517 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3518 queue
->priority_queue
= heap_alloc (compare_nodes
, 2 * nthreads
);
3519 pthread_mutex_init (&queue
->mutex
, nullptr);
3520 pthread_cond_init (&queue
->cond
, nullptr);
3523 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3524 does not need to lock NODE. */
3527 queue_insert (struct merge_node_queue
*queue
, struct merge_node
*node
)
3529 pthread_mutex_lock (&queue
->mutex
);
3530 heap_insert (queue
->priority_queue
, node
);
3531 node
->queued
= true;
3532 pthread_cond_signal (&queue
->cond
);
3533 pthread_mutex_unlock (&queue
->mutex
);
3536 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3538 static struct merge_node
*
3539 queue_pop (struct merge_node_queue
*queue
)
3541 struct merge_node
*node
;
3542 pthread_mutex_lock (&queue
->mutex
);
3543 while (! (node
= heap_remove_top (queue
->priority_queue
)))
3544 pthread_cond_wait (&queue
->cond
, &queue
->mutex
);
3545 pthread_mutex_unlock (&queue
->mutex
);
3547 node
->queued
= false;
3551 /* Output LINE to TFP, unless -u is specified and the line compares
3552 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3553 is null if TFP is standard output.
3555 This function does not save the line for comparison later, so it is
3556 appropriate only for internal sort. */
3559 write_unique (struct line
const *line
, FILE *tfp
, char const *temp_output
)
3563 if (saved_line
.text
&& ! compare (line
, &saved_line
))
3568 write_line (line
, tfp
, temp_output
);
3571 /* Merge the lines currently available to a NODE in the binary
3572 merge tree. Merge a number of lines appropriate for this merge
3573 level, assuming TOTAL_LINES is the total number of lines.
3575 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3576 the name of TFP, or is null if TFP is standard output. */
3579 mergelines_node (struct merge_node
*restrict node
, size_t total_lines
,
3580 FILE *tfp
, char const *temp_output
)
3582 struct line
*lo_orig
= node
->lo
;
3583 struct line
*hi_orig
= node
->hi
;
3584 size_t to_merge
= MAX_MERGE (total_lines
, node
->level
);
3588 if (node
->level
> MERGE_ROOT
)
3590 /* Merge to destination buffer. */
3591 struct line
*dest
= *node
->dest
;
3592 while (node
->lo
!= node
->end_lo
&& node
->hi
!= node
->end_hi
&& to_merge
--)
3593 if (compare (node
->lo
- 1, node
->hi
- 1) <= 0)
3594 *--dest
= *--node
->lo
;
3596 *--dest
= *--node
->hi
;
3598 merged_lo
= lo_orig
- node
->lo
;
3599 merged_hi
= hi_orig
- node
->hi
;
3601 if (node
->nhi
== merged_hi
)
3602 while (node
->lo
!= node
->end_lo
&& to_merge
--)
3603 *--dest
= *--node
->lo
;
3604 else if (node
->nlo
== merged_lo
)
3605 while (node
->hi
!= node
->end_hi
&& to_merge
--)
3606 *--dest
= *--node
->hi
;
3611 /* Merge directly to output. */
3612 while (node
->lo
!= node
->end_lo
&& node
->hi
!= node
->end_hi
&& to_merge
--)
3614 if (compare (node
->lo
- 1, node
->hi
- 1) <= 0)
3615 write_unique (--node
->lo
, tfp
, temp_output
);
3617 write_unique (--node
->hi
, tfp
, temp_output
);
3620 merged_lo
= lo_orig
- node
->lo
;
3621 merged_hi
= hi_orig
- node
->hi
;
3623 if (node
->nhi
== merged_hi
)
3625 while (node
->lo
!= node
->end_lo
&& to_merge
--)
3626 write_unique (--node
->lo
, tfp
, temp_output
);
3628 else if (node
->nlo
== merged_lo
)
3630 while (node
->hi
!= node
->end_hi
&& to_merge
--)
3631 write_unique (--node
->hi
, tfp
, temp_output
);
3636 merged_lo
= lo_orig
- node
->lo
;
3637 merged_hi
= hi_orig
- node
->hi
;
3638 node
->nlo
-= merged_lo
;
3639 node
->nhi
-= merged_hi
;
3642 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3643 NODE's children has available lines and the other either has
3644 available lines or has exhausted its lines. */
3647 queue_check_insert (struct merge_node_queue
*queue
, struct merge_node
*node
)
3651 bool lo_avail
= (node
->lo
- node
->end_lo
) != 0;
3652 bool hi_avail
= (node
->hi
- node
->end_hi
) != 0;
3653 if (lo_avail
? hi_avail
|| ! node
->nhi
: hi_avail
&& ! node
->nlo
)
3654 queue_insert (queue
, node
);
3658 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3661 queue_check_insert_parent (struct merge_node_queue
*queue
,
3662 struct merge_node
*node
)
3664 if (node
->level
> MERGE_ROOT
)
3666 lock_node (node
->parent
);
3667 queue_check_insert (queue
, node
->parent
);
3668 unlock_node (node
->parent
);
3670 else if (node
->nlo
+ node
->nhi
== 0)
3672 /* If the MERGE_ROOT NODE has finished merging, insert the
3674 queue_insert (queue
, node
->parent
);
3678 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3679 some of those lines, until the MERGE_END node is popped.
3680 TOTAL_LINES is the total number of lines. If merging at the top
3681 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3682 null if TFP is standard output. */
3685 merge_loop (struct merge_node_queue
*queue
,
3686 size_t total_lines
, FILE *tfp
, char const *temp_output
)
3690 struct merge_node
*node
= queue_pop (queue
);
3692 if (node
->level
== MERGE_END
)
3695 /* Reinsert so other threads can pop it. */
3696 queue_insert (queue
, node
);
3699 mergelines_node (node
, total_lines
, tfp
, temp_output
);
3700 queue_check_insert (queue
, node
);
3701 queue_check_insert_parent (queue
, node
);
3708 static void sortlines (struct line
*restrict
, size_t, size_t,
3709 struct merge_node
*, struct merge_node_queue
*,
3710 FILE *, char const *);
3712 /* Thread arguments for sortlines_thread. */
3716 /* Source, i.e., the array of lines to sort. This points just past
3717 the end of the array. */
3720 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3723 /* Number of lines in LINES and DEST. */
3724 size_t const total_lines
;
3726 /* Merge node. Lines from this node and this node's sibling will merged
3727 to this node's parent. */
3728 struct merge_node
*const node
;
3730 /* The priority queue controlling available work for the entire
3732 struct merge_node_queue
*const queue
;
3734 /* If at the top level, the file to output to, and the file's name.
3735 If the file is standard output, the file's name is null. */
3737 char const *output_temp
;
3740 /* Like sortlines, except with a signature acceptable to pthread_create. */
3743 sortlines_thread (void *data
)
3745 struct thread_args
const *args
= data
;
3746 sortlines (args
->lines
, args
->nthreads
, args
->total_lines
,
3747 args
->node
, args
->queue
, args
->tfp
,
3752 /* Sort lines, possibly in parallel. The arguments are as in struct
3755 The algorithm has three phases: node creation, sequential sort,
3758 During node creation, sortlines recursively visits each node in the
3759 binary merge tree and creates a NODE structure corresponding to all the
3760 future line merging NODE is responsible for. For each call to
3761 sortlines, half the available threads are assigned to each recursive
3762 call, until a leaf node having only 1 available thread is reached.
3764 Each leaf node then performs two sequential sorts, one on each half of
3765 the lines it is responsible for. It records in its NODE structure that
3766 there are two sorted sublists available to merge from, and inserts its
3767 NODE into the priority queue.
3769 The binary merge phase then begins. Each thread drops into a loop
3770 where the thread retrieves a NODE from the priority queue, merges lines
3771 available to that NODE, and potentially insert NODE or its parent back
3772 into the queue if there are sufficient available lines for them to
3773 merge. This continues until all lines at all nodes of the merge tree
3774 have been merged. */
3777 sortlines (struct line
*restrict lines
, size_t nthreads
,
3778 size_t total_lines
, struct merge_node
*node
,
3779 struct merge_node_queue
*queue
, FILE *tfp
, char const *temp_output
)
3781 size_t nlines
= node
->nlo
+ node
->nhi
;
3783 /* Calculate thread arguments. */
3784 size_t lo_threads
= nthreads
/ 2;
3785 size_t hi_threads
= nthreads
- lo_threads
;
3787 struct thread_args args
= {lines
, lo_threads
, total_lines
,
3788 node
->lo_child
, queue
, tfp
, temp_output
};
3790 if (nthreads
> 1 && SUBTHREAD_LINES_HEURISTIC
<= nlines
3791 && pthread_create (&thread
, nullptr, sortlines_thread
, &args
) == 0)
3793 sortlines (lines
- node
->nlo
, hi_threads
, total_lines
,
3794 node
->hi_child
, queue
, tfp
, temp_output
);
3795 pthread_join (thread
, nullptr);
3799 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3800 Sort with 1 thread. */
3801 size_t nlo
= node
->nlo
;
3802 size_t nhi
= node
->nhi
;
3803 struct line
*temp
= lines
- total_lines
;
3805 sequential_sort (lines
- nlo
, nhi
, temp
- nlo
/ 2, false);
3807 sequential_sort (lines
, nlo
, temp
, false);
3809 /* Update merge NODE. No need to lock yet. */
3811 node
->hi
= lines
- nlo
;
3812 node
->end_lo
= lines
- nlo
;
3813 node
->end_hi
= lines
- nlo
- nhi
;
3815 queue_insert (queue
, node
);
3816 merge_loop (queue
, total_lines
, tfp
, temp_output
);
3820 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3821 the same as OUTFILE. If found, replace each with the same
3822 temporary copy that can be merged into OUTFILE without destroying
3823 OUTFILE before it is completely read. This temporary copy does not
3824 count as a merge temp, so don't worry about incrementing NTEMPS in
3825 the caller; final cleanup will remove it, not zaptemp.
3827 This test ensures that an otherwise-erroneous use like
3828 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3829 It's not clear that POSIX requires this nicety.
3830 Detect common error cases, but don't try to catch obscure cases like
3831 "cat ... FILE ... | sort -m -o FILE"
3832 where traditional "sort" doesn't copy the input and where
3833 people should know that they're getting into trouble anyway.
3834 Catching these obscure cases would slow down performance in
3838 avoid_trashing_input (struct sortfile
*files
, size_t ntemps
,
3839 size_t nfiles
, char const *outfile
)
3841 struct tempnode
*tempcopy
= nullptr;
3843 for (size_t i
= ntemps
; i
< nfiles
; i
++)
3845 bool is_stdin
= STREQ (files
[i
].name
, "-");
3849 if (outfile
&& STREQ (outfile
, files
[i
].name
) && !is_stdin
)
3853 struct stat
*outst
= get_outstatus ();
3858 ? fstat (STDIN_FILENO
, &instat
)
3859 : stat (files
[i
].name
, &instat
))
3861 && psame_inode (&instat
, outst
));
3869 tempcopy
= create_temp (&tftp
);
3870 mergefiles (&files
[i
], 0, 1, tftp
, tempcopy
->name
);
3873 files
[i
].name
= tempcopy
->name
;
3874 files
[i
].temp
= tempcopy
;
3879 /* Scan the input files to ensure all are accessible.
3880 Otherwise exit with a diagnostic.
3882 This will catch common issues with permissions etc.
3883 but will fail to notice issues where you can open but not read,
3884 like when a directory is specified on some systems.
3885 Catching these obscure cases could slow down performance in
3889 check_inputs (char *const *files
, size_t nfiles
)
3891 for (size_t i
= 0; i
< nfiles
; i
++)
3893 if (STREQ (files
[i
], "-"))
3896 if (euidaccess (files
[i
], R_OK
) != 0)
3897 sort_die (_("cannot read"), files
[i
]);
3901 /* Ensure a specified output file can be created or written to,
3902 and point stdout to it. Do not truncate the file.
3903 Exit with a diagnostic on failure. */
3906 check_output (char const *outfile
)
3910 int oflags
= O_WRONLY
| O_BINARY
| O_CLOEXEC
| O_CREAT
;
3911 int outfd
= open (outfile
, oflags
, MODE_RW_UGO
);
3913 sort_die (_("open failed"), outfile
);
3914 move_fd (outfd
, STDOUT_FILENO
);
3918 /* Merge the input FILES. NTEMPS is the number of files at the
3919 start of FILES that are temporary; it is zero at the top level.
3920 NFILES is the total number of files. Put the output in
3921 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3924 merge (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
3925 char const *output_file
)
3927 while (nmerge
< nfiles
)
3929 /* Number of input files processed so far. */
3932 /* Number of output files generated so far. */
3935 /* nfiles % NMERGE; this counts input files that are left over
3936 after all full-sized merges have been done. */
3939 /* Number of easily-available slots at the next loop iteration. */
3942 /* Do as many NMERGE-size merges as possible. In the case that
3943 nmerge is bogus, increment by the maximum number of file
3944 descriptors allowed. */
3945 for (out
= in
= 0; nmerge
<= nfiles
- in
; out
++)
3948 struct tempnode
*temp
= create_temp (&tfp
);
3949 size_t num_merged
= mergefiles (&files
[in
], MIN (ntemps
, nmerge
),
3950 nmerge
, tfp
, temp
->name
);
3951 ntemps
-= MIN (ntemps
, num_merged
);
3952 files
[out
].name
= temp
->name
;
3953 files
[out
].temp
= temp
;
3957 remainder
= nfiles
- in
;
3958 cheap_slots
= nmerge
- out
% nmerge
;
3960 if (cheap_slots
< remainder
)
3962 /* So many files remain that they can't all be put into the last
3963 NMERGE-sized output window. Do one more merge. Merge as few
3964 files as possible, to avoid needless I/O. */
3965 size_t nshortmerge
= remainder
- cheap_slots
+ 1;
3967 struct tempnode
*temp
= create_temp (&tfp
);
3968 size_t num_merged
= mergefiles (&files
[in
], MIN (ntemps
, nshortmerge
),
3969 nshortmerge
, tfp
, temp
->name
);
3970 ntemps
-= MIN (ntemps
, num_merged
);
3971 files
[out
].name
= temp
->name
;
3972 files
[out
++].temp
= temp
;
3976 /* Put the remaining input files into the last NMERGE-sized output
3977 window, so they will be merged in the next pass. */
3978 memmove (&files
[out
], &files
[in
], (nfiles
- in
) * sizeof *files
);
3983 avoid_trashing_input (files
, ntemps
, nfiles
, output_file
);
3985 /* We aren't guaranteed that this final mergefiles will work, therefore we
3986 try to merge into the output, and then merge as much as we can into a
3987 temp file if we can't. Repeat. */
3991 /* Merge directly into the output file if possible. */
3993 size_t nopened
= open_input_files (files
, nfiles
, &fps
);
3995 if (nopened
== nfiles
)
3997 FILE *ofp
= stream_open (output_file
, "w");
4000 mergefps (files
, ntemps
, nfiles
, ofp
, output_file
, fps
);
4003 if (errno
!= EMFILE
|| nopened
<= 2)
4004 sort_die (_("open failed"), output_file
);
4006 else if (nopened
<= 2)
4007 sort_die (_("open failed"), files
[nopened
].name
);
4009 /* We ran out of file descriptors. Close one of the input
4010 files, to gain a file descriptor. Then create a temporary
4011 file with our spare file descriptor. Retry if that failed
4012 (e.g., some other process could open a file between the time
4013 we closed and tried to create). */
4015 struct tempnode
*temp
;
4019 xfclose (fps
[nopened
], files
[nopened
].name
);
4020 temp
= maybe_create_temp (&tfp
, ! (nopened
<= 2));
4024 /* Merge into the newly allocated temporary. */
4025 mergefps (&files
[0], MIN (ntemps
, nopened
), nopened
, tfp
, temp
->name
,
4027 ntemps
-= MIN (ntemps
, nopened
);
4028 files
[0].name
= temp
->name
;
4029 files
[0].temp
= temp
;
4031 memmove (&files
[1], &files
[nopened
], (nfiles
- nopened
) * sizeof *files
);
4033 nfiles
-= nopened
- 1;
4037 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
4040 sort (char *const *files
, size_t nfiles
, char const *output_file
,
4045 bool output_file_created
= false;
4051 char const *temp_output
;
4052 char const *file
= *files
;
4053 FILE *fp
= xfopen (file
, "r");
4056 size_t bytes_per_line
;
4062 while (tmp
< nthreads
)
4067 bytes_per_line
= (mult
* sizeof (struct line
));
4070 bytes_per_line
= sizeof (struct line
) * 3 / 2;
4073 initbuf (&buf
, bytes_per_line
,
4074 sort_buffer_size (&fp
, 1, files
, nfiles
, bytes_per_line
));
4079 while (fillbuf (&buf
, fp
, file
))
4083 if (buf
.eof
&& nfiles
4084 && (bytes_per_line
+ 1
4085 < (buf
.alloc
- buf
.used
- bytes_per_line
* buf
.nlines
)))
4087 /* End of file, but there is more input and buffer room.
4088 Concatenate the next input file; this is faster in
4090 buf
.left
= buf
.used
;
4094 saved_line
.text
= nullptr;
4095 line
= buffer_linelim (&buf
);
4096 if (buf
.eof
&& !nfiles
&& !ntemps
&& !buf
.left
)
4099 tfp
= xfopen (output_file
, "w");
4100 temp_output
= output_file
;
4101 output_file_created
= true;
4106 temp_output
= create_temp (&tfp
)->name
;
4110 struct merge_node_queue queue
;
4111 queue_init (&queue
, nthreads
);
4112 struct merge_node
*merge_tree
=
4113 merge_tree_init (nthreads
, buf
.nlines
, line
);
4115 sortlines (line
, nthreads
, buf
.nlines
, merge_tree
+ 1,
4116 &queue
, tfp
, temp_output
);
4118 merge_tree_destroy (nthreads
, merge_tree
);
4119 queue_destroy (&queue
);
4122 write_unique (line
- 1, tfp
, temp_output
);
4124 xfclose (tfp
, temp_output
);
4126 if (output_file_created
)
4135 if (! output_file_created
)
4137 struct tempnode
*node
= temphead
;
4138 struct sortfile
*tempfiles
= xnmalloc (ntemps
, sizeof *tempfiles
);
4139 for (size_t i
= 0; node
; i
++)
4141 tempfiles
[i
].name
= node
->name
;
4142 tempfiles
[i
].temp
= node
;
4145 merge (tempfiles
, ntemps
, ntemps
, output_file
);
4152 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
4155 insertkey (struct keyfield
*key_arg
)
4157 struct keyfield
**p
;
4158 struct keyfield
*key
= xmemdup (key_arg
, sizeof *key
);
4160 for (p
= &keylist
; *p
; p
= &(*p
)->next
)
4163 key
->next
= nullptr;
4166 /* Report a bad field specification SPEC, with extra info MSGID. */
4169 badfieldspec (char const *spec
, char const *msgid
)
4171 error (SORT_FAILURE
, 0, _("%s: invalid field specification %s"),
4172 _(msgid
), quote (spec
));
4175 /* Report incompatible options. */
4178 incompatible_options (char const *opts
)
4180 error (SORT_FAILURE
, 0, _("options '-%s' are incompatible"), (opts
));
4183 /* Check compatibility of ordering options. */
4186 check_ordering_compatibility (void)
4188 struct keyfield
*key
;
4190 for (key
= keylist
; key
; key
= key
->next
)
4191 if (1 < (key
->numeric
+ key
->general_numeric
+ key
->human_numeric
4192 + key
->month
+ (key
->version
| key
->random
| !!key
->ignore
)))
4194 /* The following is too big, but guaranteed to be "big enough". */
4195 char opts
[sizeof short_options
];
4196 /* Clear flags we're not interested in. */
4197 key
->skipsblanks
= key
->skipeblanks
= key
->reverse
= false;
4198 key_to_opts (key
, opts
);
4199 incompatible_options (opts
);
4203 /* Parse the leading integer in STRING and store the resulting value
4204 (which must fit into size_t) into *VAL. Return the address of the
4205 suffix after the integer. If the value is too large, silently
4206 substitute SIZE_MAX. If MSGID is null, return nullptr after
4207 failure; otherwise, report MSGID and exit on failure. */
4210 parse_field_count (char const *string
, size_t *val
, char const *msgid
)
4215 switch (xstrtoumax (string
, &suffix
, 10, &n
, ""))
4218 case LONGINT_INVALID_SUFFIX_CHAR
:
4223 case LONGINT_OVERFLOW
:
4224 case LONGINT_OVERFLOW
| LONGINT_INVALID_SUFFIX_CHAR
:
4228 case LONGINT_INVALID
:
4230 error (SORT_FAILURE
, 0, _("%s: invalid count at start of %s"),
4231 _(msgid
), quote (string
));
4238 /* Handle interrupts and hangups. */
4241 sighandler (int sig
)
4244 signal (sig
, SIG_IGN
);
4248 signal (sig
, SIG_DFL
);
4252 /* Set the ordering options for KEY specified in S.
4253 Return the address of the first character in S that
4254 is not a valid ordering option.
4255 BLANKTYPE is the kind of blanks that 'b' should skip. */
4258 set_ordering (char const *s
, struct keyfield
*key
, enum blanktype blanktype
)
4265 if (blanktype
== bl_start
|| blanktype
== bl_both
)
4266 key
->skipsblanks
= true;
4267 if (blanktype
== bl_end
|| blanktype
== bl_both
)
4268 key
->skipeblanks
= true;
4271 key
->ignore
= nondictionary
;
4274 key
->translate
= fold_toupper
;
4277 key
->general_numeric
= true;
4280 key
->human_numeric
= true;
4283 /* Option order should not matter, so don't let -i override
4284 -d. -d implies -i, but -i does not imply -d. */
4286 key
->ignore
= nonprinting
;
4292 key
->numeric
= true;
4298 key
->reverse
= true;
4301 key
->version
= true;
4311 /* Initialize KEY. */
4313 static struct keyfield
*
4314 key_init (struct keyfield
*key
)
4316 memset (key
, 0, sizeof *key
);
4317 key
->eword
= SIZE_MAX
;
4322 main (int argc
, char **argv
)
4324 struct keyfield
*key
;
4325 struct keyfield key_buf
;
4326 struct keyfield gkey
;
4327 bool gkey_only
= false;
4331 bool mergeonly
= false;
4332 char *random_source
= nullptr;
4333 bool need_random
= false;
4334 size_t nthreads
= 0;
4336 bool posixly_correct
= (getenv ("POSIXLY_CORRECT") != nullptr);
4337 int posix_ver
= posix2_version ();
4338 bool traditional_usage
= ! (200112 <= posix_ver
&& posix_ver
< 200809);
4340 char *files_from
= nullptr;
4342 char const *outfile
= nullptr;
4345 initialize_main (&argc
, &argv
);
4346 set_program_name (argv
[0]);
4347 locale_ok
= !! setlocale (LC_ALL
, "");
4348 bindtextdomain (PACKAGE
, LOCALEDIR
);
4349 textdomain (PACKAGE
);
4351 initialize_exit_failure (SORT_FAILURE
);
4353 hard_LC_COLLATE
= hard_locale (LC_COLLATE
);
4354 #if HAVE_NL_LANGINFO
4355 hard_LC_TIME
= hard_locale (LC_TIME
);
4358 /* Get locale's representation of the decimal point. */
4360 struct lconv
const *locale
= localeconv ();
4362 /* If the locale doesn't define a decimal point, or if the decimal
4363 point is multibyte, use the C locale's decimal point. FIXME:
4364 add support for multibyte decimal points. */
4365 decimal_point
= locale
->decimal_point
[0];
4366 if (! decimal_point
|| locale
->decimal_point
[1])
4367 decimal_point
= '.';
4369 /* FIXME: add support for multibyte thousands separators. */
4370 thousands_sep
= locale
->thousands_sep
[0];
4371 if (thousands_sep
&& locale
->thousands_sep
[1])
4372 thousands_sep_ignored
= true;
4373 if (! thousands_sep
|| locale
->thousands_sep
[1])
4374 thousands_sep
= NON_CHAR
;
4377 have_read_stdin
= false;
4382 static int const sig
[] =
4384 /* The usual suspects. */
4385 SIGALRM
, SIGHUP
, SIGINT
, SIGPIPE
, SIGQUIT
, SIGTERM
,
4402 enum { nsigs
= ARRAY_CARDINALITY (sig
) };
4405 struct sigaction act
;
4407 sigemptyset (&caught_signals
);
4408 for (i
= 0; i
< nsigs
; i
++)
4410 sigaction (sig
[i
], nullptr, &act
);
4411 if (act
.sa_handler
!= SIG_IGN
)
4412 sigaddset (&caught_signals
, sig
[i
]);
4415 act
.sa_handler
= sighandler
;
4416 act
.sa_mask
= caught_signals
;
4419 for (i
= 0; i
< nsigs
; i
++)
4420 if (sigismember (&caught_signals
, sig
[i
]))
4421 sigaction (sig
[i
], &act
, nullptr);
4423 for (i
= 0; i
< nsigs
; i
++)
4424 if (signal (sig
[i
], SIG_IGN
) != SIG_IGN
)
4426 signal (sig
[i
], sighandler
);
4427 siginterrupt (sig
[i
], 1);
4431 signal (SIGCHLD
, SIG_DFL
); /* Don't inherit CHLD handling from parent. */
4433 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4434 atexit (exit_cleanup
);
4437 gkey
.sword
= SIZE_MAX
;
4439 files
= xnmalloc (argc
, sizeof *files
);
4443 /* Parse an operand as a file after "--" was seen; or if
4444 pedantic and a file was seen, unless the POSIX version
4445 is not 1003.1-2001 and -c was not seen and the operand is
4446 "-o FILE" or "-oFILE". */
4450 || (posixly_correct
&& nfiles
!= 0
4451 && ! (traditional_usage
4454 && argv
[optind
][0] == '-' && argv
[optind
][1] == 'o'
4455 && (argv
[optind
][2] || optind
+ 1 != argc
)))
4456 || ((c
= getopt_long (argc
, argv
, short_options
,
4462 files
[nfiles
++] = argv
[optind
++];
4468 if (optarg
[0] == '+')
4470 bool minus_pos_usage
= (optind
!= argc
&& argv
[optind
][0] == '-'
4471 && ISDIGIT (argv
[optind
][1]));
4472 traditional_usage
|= minus_pos_usage
&& !posixly_correct
;
4473 if (traditional_usage
)
4475 /* Treat +POS1 [-POS2] as a key if possible; but silently
4476 treat an operand as a file if it is not a valid +POS1. */
4477 key
= key_init (&key_buf
);
4478 s
= parse_field_count (optarg
+ 1, &key
->sword
, nullptr);
4480 s
= parse_field_count (s
+ 1, &key
->schar
, nullptr);
4481 if (! (key
->sword
|| key
->schar
))
4482 key
->sword
= SIZE_MAX
;
4483 if (! s
|| *set_ordering (s
, key
, bl_start
))
4487 if (minus_pos_usage
)
4489 char const *optarg1
= argv
[optind
++];
4490 s
= parse_field_count (optarg1
+ 1, &key
->eword
,
4491 N_("invalid number after '-'"));
4493 s
= parse_field_count (s
+ 1, &key
->echar
,
4494 N_("invalid number after '.'"));
4495 if (!key
->echar
&& key
->eword
)
4497 /* obsolescent syntax +A.x -B.y is equivalent to:
4498 -k A+1.x+1,B.y (when y = 0)
4499 -k A+1.x+1,B+1.y (when y > 0)
4500 So eword is decremented as in the -k case
4501 only when the end field (B) is specified and
4505 if (*set_ordering (s
, key
, bl_end
))
4506 badfieldspec (optarg1
,
4507 N_("stray character in field spec"));
4509 key
->traditional_used
= true;
4515 files
[nfiles
++] = optarg
;
4519 c
= XARGMATCH ("--sort", optarg
, sort_args
, sort_types
);
4536 set_ordering (str
, &gkey
, bl_both
);
4542 ? XARGMATCH ("--check", optarg
, check_args
, check_types
)
4547 if (checkonly
&& checkonly
!= c
)
4548 incompatible_options ("cC");
4552 case COMPRESS_PROGRAM_OPTION
:
4553 if (compress_program
&& !STREQ (compress_program
, optarg
))
4554 error (SORT_FAILURE
, 0, _("multiple compress programs specified"));
4555 compress_program
= optarg
;
4558 case DEBUG_PROGRAM_OPTION
:
4562 case FILES0_FROM_OPTION
:
4563 files_from
= optarg
;
4567 key
= key_init (&key_buf
);
4570 s
= parse_field_count (optarg
, &key
->sword
,
4571 N_("invalid number at field start"));
4574 /* Provoke with 'sort -k0' */
4575 badfieldspec (optarg
, N_("field number is zero"));
4579 s
= parse_field_count (s
+ 1, &key
->schar
,
4580 N_("invalid number after '.'"));
4583 /* Provoke with 'sort -k1.0' */
4584 badfieldspec (optarg
, N_("character offset is zero"));
4587 if (! (key
->sword
|| key
->schar
))
4588 key
->sword
= SIZE_MAX
;
4589 s
= set_ordering (s
, key
, bl_start
);
4592 key
->eword
= SIZE_MAX
;
4598 s
= parse_field_count (s
+ 1, &key
->eword
,
4599 N_("invalid number after ','"));
4602 /* Provoke with 'sort -k1,0' */
4603 badfieldspec (optarg
, N_("field number is zero"));
4607 s
= parse_field_count (s
+ 1, &key
->echar
,
4608 N_("invalid number after '.'"));
4610 s
= set_ordering (s
, key
, bl_end
);
4613 badfieldspec (optarg
, N_("stray character in field spec"));
4622 specify_nmerge (oi
, c
, optarg
);
4626 if (outfile
&& !STREQ (outfile
, optarg
))
4627 error (SORT_FAILURE
, 0, _("multiple output files specified"));
4631 case RANDOM_SOURCE_OPTION
:
4632 if (random_source
&& !STREQ (random_source
, optarg
))
4633 error (SORT_FAILURE
, 0, _("multiple random sources specified"));
4634 random_source
= optarg
;
4642 specify_sort_size (oi
, c
, optarg
);
4647 char newtab
= optarg
[0];
4649 error (SORT_FAILURE
, 0, _("empty tab"));
4652 if (STREQ (optarg
, "\\0"))
4656 /* Provoke with 'sort -txx'. Complain about
4657 "multi-character tab" instead of "multibyte tab", so
4658 that the diagnostic's wording does not need to be
4659 changed once multibyte characters are supported. */
4660 error (SORT_FAILURE
, 0, _("multi-character tab %s"),
4664 if (tab
!= TAB_DEFAULT
&& tab
!= newtab
)
4665 error (SORT_FAILURE
, 0, _("incompatible tabs"));
4671 add_temp_dir (optarg
);
4674 case PARALLEL_OPTION
:
4675 nthreads
= specify_nthreads (oi
, c
, optarg
);
4683 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4684 through Solaris 7. It is also accepted by many non-Solaris
4685 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4686 -y is marked as obsolete starting with Solaris 8 (1999), but is
4687 still accepted as of Solaris 10 prerelease (2004).
4689 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4690 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4691 and which in general ignores the argument after "-y" if it
4692 consists entirely of digits (it can even be empty). */
4693 if (optarg
== argv
[optind
- 1])
4696 for (p
= optarg
; ISDIGIT (*p
); p
++)
4698 optind
-= (*p
!= '\0');
4706 case_GETOPT_HELP_CHAR
;
4708 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
4711 usage (SORT_FAILURE
);
4717 /* When using --files0-from=F, you may not specify any files
4718 on the command-line. */
4721 error (0, 0, _("extra operand %s"), quoteaf (files
[0]));
4722 fprintf (stderr
, "%s\n",
4723 _("file operands cannot be combined with --files0-from"));
4724 usage (SORT_FAILURE
);
4727 FILE *stream
= xfopen (files_from
, "r");
4729 readtokens0_init (&tok
);
4731 if (! readtokens0 (stream
, &tok
))
4732 error (SORT_FAILURE
, 0, _("cannot read file names from %s"),
4733 quoteaf (files_from
));
4734 xfclose (stream
, files_from
);
4741 for (size_t i
= 0; i
< nfiles
; i
++)
4743 if (STREQ (files
[i
], "-"))
4744 error (SORT_FAILURE
, 0, _("when reading file names from stdin, "
4745 "no file name of %s allowed"),
4746 quoteaf (files
[i
]));
4747 else if (files
[i
][0] == '\0')
4749 /* Using the standard 'filename:line-number:' prefix here is
4750 not totally appropriate, since NUL is the separator,
4751 not NL, but it might be better than nothing. */
4752 unsigned long int file_number
= i
+ 1;
4753 error (SORT_FAILURE
, 0,
4754 _("%s:%lu: invalid zero-length file name"),
4755 quotef (files_from
), file_number
);
4760 error (SORT_FAILURE
, 0, _("no input from %s"),
4761 quoteaf (files_from
));
4764 /* Inheritance of global options to individual keys. */
4765 for (key
= keylist
; key
; key
= key
->next
)
4767 if (default_key_compare (key
) && !key
->reverse
)
4769 key
->ignore
= gkey
.ignore
;
4770 key
->translate
= gkey
.translate
;
4771 key
->skipsblanks
= gkey
.skipsblanks
;
4772 key
->skipeblanks
= gkey
.skipeblanks
;
4773 key
->month
= gkey
.month
;
4774 key
->numeric
= gkey
.numeric
;
4775 key
->general_numeric
= gkey
.general_numeric
;
4776 key
->human_numeric
= gkey
.human_numeric
;
4777 key
->version
= gkey
.version
;
4778 key
->random
= gkey
.random
;
4779 key
->reverse
= gkey
.reverse
;
4782 need_random
|= key
->random
;
4785 if (!keylist
&& !default_key_compare (&gkey
))
4789 need_random
|= gkey
.random
;
4792 check_ordering_compatibility ();
4796 if (checkonly
|| outfile
)
4798 static char opts
[] = "X --debug";
4799 opts
[0] = (checkonly
? checkonly
: 'o');
4800 incompatible_options (opts
);
4803 /* Always output the locale in debug mode, since this
4804 is such a common source of confusion. */
4806 /* OpenBSD can only set some categories with LC_ALL above,
4807 so set LC_COLLATE explicitly to flag errors. */
4809 locale_ok
= !! setlocale (LC_COLLATE
, "");
4811 error (0, 0, "%s", _("failed to set locale"));
4812 if (hard_LC_COLLATE
)
4813 error (0, 0, _("text ordering performed using %s sorting rules"),
4814 quote (setlocale (LC_COLLATE
, nullptr)));
4817 _("text ordering performed using simple byte comparison"));
4819 key_warnings (&gkey
, gkey_only
);
4822 reverse
= gkey
.reverse
;
4825 random_md5_state_init (random_source
);
4827 if (temp_dir_count
== 0)
4829 char const *tmp_dir
= getenv ("TMPDIR");
4830 add_temp_dir (tmp_dir
? tmp_dir
: DEFAULT_TMPDIR
);
4837 files
= xmalloc (sizeof *files
);
4838 *files
= (char *) "-";
4841 /* Need to re-check that we meet the minimum requirement for memory
4842 usage with the final value for NMERGE. */
4844 sort_size
= MAX (sort_size
, MIN_SORT_SIZE
);
4849 error (SORT_FAILURE
, 0, _("extra operand %s not allowed with -%c"),
4850 quoteaf (files
[1]), checkonly
);
4854 static char opts
[] = {0, 'o', 0};
4855 opts
[0] = checkonly
;
4856 incompatible_options (opts
);
4859 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4860 input is not properly sorted. */
4861 exit (check (files
[0], checkonly
) ? EXIT_SUCCESS
: SORT_OUT_OF_ORDER
);
4864 /* Check all inputs are accessible, or exit immediately. */
4865 check_inputs (files
, nfiles
);
4867 /* Check output is writable, or exit immediately. */
4868 check_output (outfile
);
4872 struct sortfile
*sortfiles
= xcalloc (nfiles
, sizeof *sortfiles
);
4874 for (size_t i
= 0; i
< nfiles
; ++i
)
4875 sortfiles
[i
].name
= files
[i
];
4877 merge (sortfiles
, 0, nfiles
, outfile
);
4883 unsigned long int np
= num_processors (NPROC_CURRENT_OVERRIDABLE
);
4884 nthreads
= MIN (np
, DEFAULT_MAX_THREADS
);
4887 /* Avoid integer overflow later. */
4888 size_t nthreads_max
= SIZE_MAX
/ (2 * sizeof (struct merge_node
));
4889 nthreads
= MIN (nthreads
, nthreads_max
);
4891 sort (files
, nfiles
, outfile
, nthreads
);
4894 if (have_read_stdin
&& fclose (stdin
) == EOF
)
4895 sort_die (_("close failed"), "-");
4897 main_exit (EXIT_SUCCESS
);