1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2015 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
27 #include <sys/resource.h>
28 #include <sys/types.h>
36 #include "filevercmp.h"
37 #include "hard-locale.h"
40 #include "ignore-value.h"
48 #include "readtokens0.h"
51 #include "strnumcmp.h"
53 #include "xnanosleep.h"
57 struct rlimit
{ size_t rlim_cur
; };
58 # define getrlimit(Resource, Rlp) (-1)
61 /* The official name of this program (e.g., no 'g' prefix). */
62 #define PROGRAM_NAME "sort"
65 proper_name ("Mike Haertel"), \
66 proper_name ("Paul Eggert")
68 #if HAVE_LANGINFO_CODESET
69 # include <langinfo.h>
72 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
75 # define SA_NOCLDSTOP 0
76 /* No sigprocmask. Always 'return' zero. */
77 # define sigprocmask(How, Set, Oset) (0)
79 # if ! HAVE_SIGINTERRUPT
80 # define siginterrupt(sig, flag) /* empty */
84 #if !defined OPEN_MAX && defined NR_OPEN
85 # define OPEN_MAX NR_OPEN
91 #define UCHAR_LIM (UCHAR_MAX + 1)
94 # define long_double long double
96 # define long_double double
98 # define strtold strtod
101 #ifndef DEFAULT_TMPDIR
102 # define DEFAULT_TMPDIR "/tmp"
105 /* Maximum number of lines to merge every time a NODE is taken from
106 the merge queue. Node is at LEVEL in the binary merge tree,
107 and is responsible for merging TOTAL lines. */
108 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
110 /* Heuristic value for the number of lines for which it is worth creating
111 a subthread, during an internal merge sort. I.e., it is a small number
112 of "average" lines for which sorting via two threads is faster than
113 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
114 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
115 lines of gensort -a output is sorted slightly faster with --parallel=2
116 than with --parallel=1. By contrast, using --parallel=1 is about 10%
117 faster than using --parallel=2 with a 64K-line input. */
118 enum { SUBTHREAD_LINES_HEURISTIC
= 128 * 1024 };
119 verify (4 <= SUBTHREAD_LINES_HEURISTIC
);
121 /* The number of threads after which there are
122 diminishing performance gains. */
123 enum { DEFAULT_MAX_THREADS
= 8 };
128 /* POSIX says to exit with status 1 if invoked with -c and the
129 input is not properly sorted. */
130 SORT_OUT_OF_ORDER
= 1,
132 /* POSIX says any other irregular exit must exit with a status
133 code greater than 1. */
139 /* The number of times we should try to fork a compression process
140 (we retry if the fork call fails). We don't _need_ to compress
141 temp files, this is just to reduce disk access, so this number
142 can be small. Each retry doubles in duration. */
143 MAX_FORK_TRIES_COMPRESS
= 4,
145 /* The number of times we should try to fork a decompression process.
146 If we can't fork a decompression process, we can't sort, so this
147 number should be big. Each retry doubles in duration. */
148 MAX_FORK_TRIES_DECOMPRESS
= 9
153 /* Level of the end-of-merge node, one level above the root. */
156 /* Level of the root node in merge tree. */
160 /* The representation of the decimal point in the current locale. */
161 static int decimal_point
;
163 /* Thousands separator; if -1, then there isn't one. */
164 static int thousands_sep
;
166 /* Nonzero if the corresponding locales are hard. */
167 static bool hard_LC_COLLATE
;
169 static bool hard_LC_TIME
;
172 #define NONZERO(x) ((x) != 0)
174 /* The kind of blanks for '-b' to skip in various options. */
175 enum blanktype
{ bl_start
, bl_end
, bl_both
};
177 /* The character marking end of line. Default to \n. */
178 static char eolchar
= '\n';
180 /* Lines are held in core as counted strings. */
183 char *text
; /* Text of the line. */
184 size_t length
; /* Length including final newline. */
185 char *keybeg
; /* Start of first key. */
186 char *keylim
; /* Limit of first key. */
192 char *buf
; /* Dynamically allocated buffer,
193 partitioned into 3 regions:
196 - an array of lines, in reverse order. */
197 size_t used
; /* Number of bytes used for input data. */
198 size_t nlines
; /* Number of lines in the line array. */
199 size_t alloc
; /* Number of bytes allocated. */
200 size_t left
; /* Number of bytes left from previous reads. */
201 size_t line_bytes
; /* Number of bytes to reserve for each line. */
202 bool eof
; /* An EOF has been read. */
208 size_t sword
; /* Zero-origin 'word' to start at. */
209 size_t schar
; /* Additional characters to skip. */
210 size_t eword
; /* Zero-origin last 'word' of key. */
211 size_t echar
; /* Additional characters in field. */
212 bool const *ignore
; /* Boolean array of characters to ignore. */
213 char const *translate
; /* Translation applied to characters. */
214 bool skipsblanks
; /* Skip leading blanks when finding start. */
215 bool skipeblanks
; /* Skip leading blanks when finding end. */
216 bool numeric
; /* Flag for numeric comparison. Handle
217 strings of digits with optional decimal
218 point, but no exponential notation. */
219 bool random
; /* Sort by random hash of key. */
220 bool general_numeric
; /* Flag for general, numeric comparison.
221 Handle numbers in exponential notation. */
222 bool human_numeric
; /* Flag for sorting by human readable
223 units with either SI or IEC prefixes. */
224 bool month
; /* Flag for comparison by month name. */
225 bool reverse
; /* Reverse the sense of comparison. */
226 bool version
; /* sort by version number */
227 bool obsolete_used
; /* obsolescent key option format is used. */
228 struct keyfield
*next
; /* Next keyfield to try. */
237 /* Binary merge tree node. */
240 struct line
*lo
; /* Lines to merge from LO child node. */
241 struct line
*hi
; /* Lines to merge from HI child ndoe. */
242 struct line
*end_lo
; /* End of available lines from LO. */
243 struct line
*end_hi
; /* End of available lines from HI. */
244 struct line
**dest
; /* Pointer to destination of merge. */
245 size_t nlo
; /* Total Lines remaining from LO. */
246 size_t nhi
; /* Total lines remaining from HI. */
247 struct merge_node
*parent
; /* Parent node. */
248 struct merge_node
*lo_child
; /* LO child node. */
249 struct merge_node
*hi_child
; /* HI child node. */
250 unsigned int level
; /* Level in merge tree. */
251 bool queued
; /* Node is already in heap. */
252 pthread_mutex_t lock
; /* Lock for node operations. */
255 /* Priority queue of merge nodes. */
256 struct merge_node_queue
258 struct heap
*priority_queue
; /* Priority queue of merge tree nodes. */
259 pthread_mutex_t mutex
; /* Lock for queue operations. */
260 pthread_cond_t cond
; /* Conditional wait for empty queue to populate
264 /* Used to implement --unique (-u). */
265 static struct line saved_line
;
267 /* FIXME: None of these tables work with multibyte character sets.
268 Also, there are many other bugs when handling multibyte characters.
269 One way to fix this is to rewrite 'sort' to use wide characters
270 internally, but doing this with good performance is a bit
273 /* Table of blanks. */
274 static bool blanks
[UCHAR_LIM
];
276 /* Table of non-printing characters. */
277 static bool nonprinting
[UCHAR_LIM
];
279 /* Table of non-dictionary characters (not letters, digits, or blanks). */
280 static bool nondictionary
[UCHAR_LIM
];
282 /* Translation table folding lower case to upper. */
283 static char fold_toupper
[UCHAR_LIM
];
285 #define MONTHS_PER_YEAR 12
287 /* Table mapping month names to integers.
288 Alphabetic order allows binary search. */
289 static struct month monthtab
[] =
305 /* During the merge phase, the number of files to merge at once. */
306 #define NMERGE_DEFAULT 16
308 /* Minimum size for a merge or check buffer. */
309 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
311 /* Minimum sort size; the code might not work with smaller sizes. */
312 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
314 /* The number of bytes needed for a merge or check buffer, which can
315 function relatively efficiently even if it holds only one line. If
316 a longer line is seen, this value is increased. */
317 static size_t merge_buffer_size
= MAX (MIN_MERGE_BUFFER_SIZE
, 256 * 1024);
319 /* The approximate maximum number of bytes of main memory to use, as
320 specified by the user. Zero if the user has not specified a size. */
321 static size_t sort_size
;
323 /* The initial allocation factor for non-regular files.
324 This is used, e.g., when reading from a pipe.
325 Don't make it too big, since it is multiplied by ~130 to
326 obtain the size of the actual buffer sort will allocate.
327 Also, there may be 8 threads all doing this at the same time. */
328 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
330 /* Array of directory names in which any temporary files are to be created. */
331 static char const **temp_dirs
;
333 /* Number of temporary directory names used. */
334 static size_t temp_dir_count
;
336 /* Number of allocated slots in temp_dirs. */
337 static size_t temp_dir_alloc
;
339 /* Flag to reverse the order of all comparisons. */
342 /* Flag for stable sort. This turns off the last ditch bytewise
343 comparison of lines, and instead leaves lines in the same order
344 they were read if all keys compare equal. */
347 /* If TAB has this value, blanks separate fields. */
348 enum { TAB_DEFAULT
= CHAR_MAX
+ 1 };
350 /* Tab character separating fields. If TAB_DEFAULT, then fields are
351 separated by the empty string between a non-blank character and a blank
353 static int tab
= TAB_DEFAULT
;
355 /* Flag to remove consecutive duplicate lines from the output.
356 Only the last of a sequence of equal lines will be output. */
359 /* Nonzero if any of the input files are the standard input. */
360 static bool have_read_stdin
;
362 /* List of key field comparisons to be tried. */
363 static struct keyfield
*keylist
;
365 /* Program used to (de)compress temp files. Must accept -d. */
366 static char const *compress_program
;
368 /* Annotate the output with extra info to aid the user. */
371 /* Maximum number of files to merge in one go. If more than this
372 number are present, temp files will be used. */
373 static unsigned int nmerge
= NMERGE_DEFAULT
;
375 /* Output an error to stderr using async-signal-safe routines, and _exit().
376 This can be used safely from signal handlers,
377 and between fork() and exec() of multithreaded processes. */
379 static void async_safe_die (int, const char *) ATTRIBUTE_NORETURN
;
381 async_safe_die (int errnum
, const char *errstr
)
383 ignore_value (write (STDERR_FILENO
, errstr
, strlen (errstr
)));
385 /* Even if defined HAVE_STRERROR_R, we can't use it,
386 as it may return a translated string etc. and even if not
387 may malloc() which is unsafe. We might improve this
388 by testing for sys_errlist and using that if available.
389 For now just report the error number. */
392 char errbuf
[INT_BUFSIZE_BOUND (errnum
)];
393 char *p
= inttostr (errnum
, errbuf
);
394 ignore_value (write (STDERR_FILENO
, ": errno ", 8));
395 ignore_value (write (STDERR_FILENO
, p
, strlen (p
)));
398 ignore_value (write (STDERR_FILENO
, "\n", 1));
400 _exit (SORT_FAILURE
);
403 /* Report MESSAGE for FILE, then clean up and exit.
404 If FILE is null, it represents standard output. */
406 static void die (char const *, char const *) ATTRIBUTE_NORETURN
;
408 die (char const *message
, char const *file
)
410 error (0, errno
, "%s: %s", message
,
411 quotef (file
? file
: _("standard output")));
418 if (status
!= EXIT_SUCCESS
)
423 Usage: %s [OPTION]... [FILE]...\n\
424 or: %s [OPTION]... --files0-from=F\n\
426 program_name
, program_name
);
428 Write sorted concatenation of all FILE(s) to standard output.\n\
432 emit_mandatory_arg_note ();
439 -b, --ignore-leading-blanks ignore leading blanks\n\
440 -d, --dictionary-order consider only blanks and alphanumeric characters\
442 -f, --ignore-case fold lower case to upper case characters\n\
445 -g, --general-numeric-sort compare according to general numerical value\n\
446 -i, --ignore-nonprinting consider only printable characters\n\
447 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
450 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
453 -n, --numeric-sort compare according to string numerical value\n\
454 -R, --random-sort shuffle, but group identical keys. See shuf(1)\n\
455 --random-source=FILE get random bytes from FILE\n\
456 -r, --reverse reverse the result of comparisons\n\
459 --sort=WORD sort according to WORD:\n\
460 general-numeric -g, human-numeric -h, month -M,\
462 numeric -n, random -R, version -V\n\
463 -V, --version-sort natural sort of (version) numbers within text\n\
471 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
472 for more use temp files\n\
475 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
476 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
478 --compress-program=PROG compress temporaries with PROG;\n\
479 decompress them with PROG -d\n\
482 --debug annotate the part of the line used to sort,\n\
483 and warn about questionable usage to stderr\n\
484 --files0-from=F read input from the files specified by\n\
485 NUL-terminated names in file F;\n\
486 If F is - then read names from standard input\n\
489 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
490 -m, --merge merge already sorted files; do not sort\n\
493 -o, --output=FILE write result to FILE instead of standard output\n\
494 -s, --stable stabilize sort by disabling last-resort comparison\
496 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
499 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
500 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
501 multiple options specify multiple directories\n\
502 --parallel=N change the number of sorts run concurrently to N\n\
503 -u, --unique with -c, check for strict ordering;\n\
504 without -c, output only the first of an equal run\
508 -z, --zero-terminated line delimiter is NUL, not newline\n\
510 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
511 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
514 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
515 field number and C a character position in the field; both are origin 1, and\n\
516 the stop position defaults to the line's end. If neither -t nor -b is in\n\
517 effect, characters in a field are counted from the beginning of the preceding\n\
518 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
520 which override global ordering options for that key. If no key is given, use\n\
521 the entire line as the key.\n\
523 SIZE may be followed by the following multiplicative suffixes:\n\
526 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
529 The locale specified by the environment affects sort order.\n\
530 Set LC_ALL=C to get the traditional sort order that uses\n\
531 native byte values.\n\
533 emit_ancillary_info (PROGRAM_NAME
);
539 /* For long options that have no equivalent short option, use a
540 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
543 CHECK_OPTION
= CHAR_MAX
+ 1,
544 COMPRESS_PROGRAM_OPTION
,
545 DEBUG_PROGRAM_OPTION
,
548 RANDOM_SOURCE_OPTION
,
553 static char const short_options
[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
555 static struct option
const long_options
[] =
557 {"ignore-leading-blanks", no_argument
, NULL
, 'b'},
558 {"check", optional_argument
, NULL
, CHECK_OPTION
},
559 {"compress-program", required_argument
, NULL
, COMPRESS_PROGRAM_OPTION
},
560 {"debug", no_argument
, NULL
, DEBUG_PROGRAM_OPTION
},
561 {"dictionary-order", no_argument
, NULL
, 'd'},
562 {"ignore-case", no_argument
, NULL
, 'f'},
563 {"files0-from", required_argument
, NULL
, FILES0_FROM_OPTION
},
564 {"general-numeric-sort", no_argument
, NULL
, 'g'},
565 {"ignore-nonprinting", no_argument
, NULL
, 'i'},
566 {"key", required_argument
, NULL
, 'k'},
567 {"merge", no_argument
, NULL
, 'm'},
568 {"month-sort", no_argument
, NULL
, 'M'},
569 {"numeric-sort", no_argument
, NULL
, 'n'},
570 {"human-numeric-sort", no_argument
, NULL
, 'h'},
571 {"version-sort", no_argument
, NULL
, 'V'},
572 {"random-sort", no_argument
, NULL
, 'R'},
573 {"random-source", required_argument
, NULL
, RANDOM_SOURCE_OPTION
},
574 {"sort", required_argument
, NULL
, SORT_OPTION
},
575 {"output", required_argument
, NULL
, 'o'},
576 {"reverse", no_argument
, NULL
, 'r'},
577 {"stable", no_argument
, NULL
, 's'},
578 {"batch-size", required_argument
, NULL
, NMERGE_OPTION
},
579 {"buffer-size", required_argument
, NULL
, 'S'},
580 {"field-separator", required_argument
, NULL
, 't'},
581 {"temporary-directory", required_argument
, NULL
, 'T'},
582 {"unique", no_argument
, NULL
, 'u'},
583 {"zero-terminated", no_argument
, NULL
, 'z'},
584 {"parallel", required_argument
, NULL
, PARALLEL_OPTION
},
585 {GETOPT_HELP_OPTION_DECL
},
586 {GETOPT_VERSION_OPTION_DECL
},
590 #define CHECK_TABLE \
592 _ct_("silent", 'C') \
593 _ct_("diagnose-first", 'c')
595 static char const *const check_args
[] =
597 #define _ct_(_s, _c) _s,
601 static char const check_types
[] =
603 #define _ct_(_s, _c) _c,
609 _st_("general-numeric", 'g') \
610 _st_("human-numeric", 'h') \
612 _st_("numeric", 'n') \
613 _st_("random", 'R') \
616 static char const *const sort_args
[] =
618 #define _st_(_s, _c) _s,
622 static char const sort_types
[] =
624 #define _st_(_s, _c) _c,
629 /* The set of signals that are caught. */
630 static sigset_t caught_signals
;
632 /* Critical section status. */
639 /* Enter a critical section. */
640 static struct cs_status
643 struct cs_status status
;
644 status
.valid
= (sigprocmask (SIG_BLOCK
, &caught_signals
, &status
.sigs
) == 0);
648 /* Leave a critical section. */
650 cs_leave (struct cs_status status
)
654 /* Ignore failure when restoring the signal mask. */
655 sigprocmask (SIG_SETMASK
, &status
.sigs
, NULL
);
659 /* Possible states for a temp file. If compressed, the file's status
660 is unreaped or reaped, depending on whether 'sort' has waited for
661 the subprocess to finish. */
662 enum { UNCOMPRESSED
, UNREAPED
, REAPED
};
664 /* The list of temporary files. */
667 struct tempnode
*volatile next
;
668 pid_t pid
; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
670 char name
[1]; /* Actual size is 1 + file name length. */
672 static struct tempnode
*volatile temphead
;
673 static struct tempnode
*volatile *temptail
= &temphead
;
675 /* A file to be sorted. */
678 /* The file's name. */
681 /* Nonnull if this is a temporary file, in which case NAME == TEMP->name. */
682 struct tempnode
*temp
;
685 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
686 static Hash_table
*proctab
;
688 enum { INIT_PROCTAB_SIZE
= 47 };
691 proctab_hasher (void const *entry
, size_t tabsize
)
693 struct tempnode
const *node
= entry
;
694 return node
->pid
% tabsize
;
698 proctab_comparator (void const *e1
, void const *e2
)
700 struct tempnode
const *n1
= e1
;
701 struct tempnode
const *n2
= e2
;
702 return n1
->pid
== n2
->pid
;
705 /* The number of unreaped child processes. */
708 static bool delete_proc (pid_t
);
710 /* If PID is positive, wait for the child process with that PID to
711 exit, and assume that PID has already been removed from the process
712 table. If PID is 0 or -1, clean up some child that has exited (by
713 waiting for it, and removing it from the proc table) and return the
714 child's process ID. However, if PID is 0 and no children have
715 exited, return 0 without waiting. */
721 pid_t cpid
= waitpid ((pid
? pid
: -1), &status
, (pid
? 0 : WNOHANG
));
724 error (SORT_FAILURE
, errno
, _("waiting for %s [-d]"),
725 quoteaf (compress_program
));
726 else if (0 < cpid
&& (0 < pid
|| delete_proc (cpid
)))
728 if (! WIFEXITED (status
) || WEXITSTATUS (status
))
729 error (SORT_FAILURE
, 0, _("%s [-d] terminated abnormally"),
730 quoteaf (compress_program
));
737 /* TEMP represents a new process; add it to the process table. Create
738 the process table the first time it's called. */
741 register_proc (struct tempnode
*temp
)
745 proctab
= hash_initialize (INIT_PROCTAB_SIZE
, NULL
,
753 temp
->state
= UNREAPED
;
755 if (! hash_insert (proctab
, temp
))
759 /* If PID is in the process table, remove it and return true.
760 Otherwise, return false. */
763 delete_proc (pid_t pid
)
765 struct tempnode test
;
768 struct tempnode
*node
= hash_delete (proctab
, &test
);
771 node
->state
= REAPED
;
775 /* Remove PID from the process table, and wait for it to exit if it
779 wait_proc (pid_t pid
)
781 if (delete_proc (pid
))
785 /* Reap any exited children. Do not block; reap only those that have
791 while (0 < nprocs
&& reap (0))
795 /* Reap at least one exited child, waiting if necessary. */
804 /* Reap all children, waiting if necessary. */
813 /* Clean up any remaining temporary files. */
818 struct tempnode
const *node
;
820 for (node
= temphead
; node
; node
= node
->next
)
825 /* Cleanup actions to take when exiting. */
832 /* Clean up any remaining temporary files in a critical section so
833 that a signal handler does not try to clean them too. */
834 struct cs_status cs
= cs_enter ();
842 /* Create a new temporary file, returning its newly allocated tempnode.
843 Store into *PFD the file descriptor open for writing.
844 If the creation fails, return NULL and store -1 into *PFD if the
845 failure is due to file descriptor exhaustion and
846 SURVIVE_FD_EXHAUSTION; otherwise, die. */
848 static struct tempnode
*
849 create_temp_file (int *pfd
, bool survive_fd_exhaustion
)
851 static char const slashbase
[] = "/sortXXXXXX";
852 static size_t temp_dir_index
;
855 char const *temp_dir
= temp_dirs
[temp_dir_index
];
856 size_t len
= strlen (temp_dir
);
857 struct tempnode
*node
=
858 xmalloc (offsetof (struct tempnode
, name
) + len
+ sizeof slashbase
);
859 char *file
= node
->name
;
862 memcpy (file
, temp_dir
, len
);
863 memcpy (file
+ len
, slashbase
, sizeof slashbase
);
865 if (++temp_dir_index
== temp_dir_count
)
868 /* Create the temporary file in a critical section, to avoid races. */
874 temptail
= &node
->next
;
882 if (! (survive_fd_exhaustion
&& errno
== EMFILE
))
883 error (SORT_FAILURE
, errno
, _("cannot create temporary file in %s"),
893 /* Return a stream for FILE, opened with mode HOW. A null FILE means
894 standard output; HOW should be "w". When opening for input, "-"
895 means standard input. To avoid confusion, do not return file
896 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
897 opening an ordinary FILE. Return NULL if unsuccessful.
899 fadvise() is used to specify an access pattern for input files.
900 There are a few hints we could possibly provide,
901 and after careful testing it was decided that
902 specifying POSIX_FADV_SEQUENTIAL was not detrimental
903 to any cases. On Linux 2.6.31, this option doubles
904 the size of read ahead performed and thus was seen to
907 Sorting with a smaller internal buffer
908 Reading from faster flash devices
910 In _addition_ one could also specify other hints...
912 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
913 at least uses that to _synchronously_ prepopulate the cache
914 with the specified range. While sort does need to
915 read all of its input before outputting, a synchronous
916 read of the whole file up front precludes any processing
917 that sort could do in parallel with the system doing
918 read ahead of the data. This was seen to have negative effects
919 in a couple of cases:
921 Sorting with a smaller internal buffer
922 Note this option was seen to shorten the runtime for sort
923 on a multicore system with lots of RAM and other processes
924 competing for CPU. It could be argued that more explicit
925 scheduling hints with 'nice' et. al. are more appropriate
928 POSIX_FADV_NOREUSE is a possibility as it could lower
929 the priority of input data in the cache as sort will
930 only need to process it once. However its functionality
931 has changed over Linux kernel versions and as of 2.6.31
932 it does nothing and thus we can't depend on what it might
935 POSIX_FADV_DONTNEED is not appropriate for user specified
936 input files, but for temp files we do want to drop the
937 cache immediately after processing. This is done implicitly
938 however when the files are unlinked. */
941 stream_open (char const *file
, char const *how
)
947 if (STREQ (file
, "-"))
949 have_read_stdin
= true;
953 fp
= fopen (file
, how
);
954 fadvise (fp
, FADVISE_SEQUENTIAL
);
956 else if (*how
== 'w')
958 if (file
&& ftruncate (STDOUT_FILENO
, 0) != 0)
959 error (SORT_FAILURE
, errno
, _("%s: error truncating"),
964 assert (!"unexpected mode passed to stream_open");
969 /* Same as stream_open, except always return a non-null value; die on
973 xfopen (char const *file
, char const *how
)
975 FILE *fp
= stream_open (file
, how
);
977 die (_("open failed"), file
);
981 /* Close FP, whose name is FILE, and report any errors. */
984 xfclose (FILE *fp
, char const *file
)
989 /* Allow reading stdin from tty more than once. */
995 /* Don't close stdout just yet. close_stdout does that. */
996 if (fflush (fp
) != 0)
997 die (_("fflush failed"), file
);
1001 if (fclose (fp
) != 0)
1002 die (_("close failed"), file
);
1008 move_fd_or_die (int oldfd
, int newfd
)
1012 /* This should never fail for our usage. */
1013 dup2 (oldfd
, newfd
);
1018 /* Fork a child process for piping to and do common cleanup. The
1019 TRIES parameter tells us how many times to try to fork before
1020 giving up. Return the PID of the child, or -1 (setting errno)
1024 pipe_fork (int pipefds
[2], size_t tries
)
1026 #if HAVE_WORKING_FORK
1027 struct tempnode
*saved_temphead
;
1029 double wait_retry
= 0.25;
1030 pid_t pid
IF_LINT ( = -1);
1031 struct cs_status cs
;
1033 if (pipe (pipefds
) < 0)
1036 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1037 uncontrolled subprocess generation can hurt performance significantly.
1038 Allow at most NMERGE + 2 subprocesses, on the theory that there
1039 may be some useful parallelism by letting compression for the
1040 previous merge finish (1 subprocess) in parallel with the current
1041 merge (NMERGE + 1 subprocesses). */
1043 if (nmerge
+ 1 < nprocs
)
1048 /* This is so the child process won't delete our temp files
1049 if it receives a signal before exec-ing. */
1051 saved_temphead
= temphead
;
1055 saved_errno
= errno
;
1057 temphead
= saved_temphead
;
1060 errno
= saved_errno
;
1062 if (0 <= pid
|| errno
!= EAGAIN
)
1066 xnanosleep (wait_retry
);
1074 saved_errno
= errno
;
1077 errno
= saved_errno
;
1081 close (STDIN_FILENO
);
1082 close (STDOUT_FILENO
);
1089 #else /* ! HAVE_WORKING_FORK */
1094 /* Create a temporary file and, if asked for, start a compressor
1095 to that file. Set *PFP to the file handle and return
1096 the address of the new temp node. If the creation
1097 fails, return NULL if the failure is due to file descriptor
1098 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1100 static struct tempnode
*
1101 maybe_create_temp (FILE **pfp
, bool survive_fd_exhaustion
)
1104 struct tempnode
*node
= create_temp_file (&tempfd
, survive_fd_exhaustion
);
1108 node
->state
= UNCOMPRESSED
;
1110 if (compress_program
)
1114 node
->pid
= pipe_fork (pipefds
, MAX_FORK_TRIES_COMPRESS
);
1119 tempfd
= pipefds
[1];
1121 register_proc (node
);
1123 else if (node
->pid
== 0)
1125 /* Being the child of a multithreaded program before exec(),
1126 we're restricted to calling async-signal-safe routines here. */
1128 move_fd_or_die (tempfd
, STDOUT_FILENO
);
1129 move_fd_or_die (pipefds
[0], STDIN_FILENO
);
1131 execlp (compress_program
, compress_program
, (char *) NULL
);
1133 async_safe_die (errno
, "couldn't execute compress program");
1137 *pfp
= fdopen (tempfd
, "w");
1139 die (_("couldn't create temporary file"), node
->name
);
1144 /* Create a temporary file and, if asked for, start a compressor
1145 to that file. Set *PFP to the file handle and return the address
1146 of the new temp node. Die on failure. */
1148 static struct tempnode
*
1149 create_temp (FILE **pfp
)
1151 return maybe_create_temp (pfp
, false);
1154 /* Open a compressed temp file and start a decompression process through
1155 which to filter the input. Return NULL (setting errno to
1156 EMFILE) if we ran out of file descriptors, and die on any other
1160 open_temp (struct tempnode
*temp
)
1162 int tempfd
, pipefds
[2];
1165 if (temp
->state
== UNREAPED
)
1166 wait_proc (temp
->pid
);
1168 tempfd
= open (temp
->name
, O_RDONLY
);
1172 pid_t child
= pipe_fork (pipefds
, MAX_FORK_TRIES_DECOMPRESS
);
1177 if (errno
!= EMFILE
)
1178 error (SORT_FAILURE
, errno
, _("couldn't create process for %s -d"),
1179 quoteaf (compress_program
));
1185 /* Being the child of a multithreaded program before exec(),
1186 we're restricted to calling async-signal-safe routines here. */
1188 move_fd_or_die (tempfd
, STDIN_FILENO
);
1189 move_fd_or_die (pipefds
[1], STDOUT_FILENO
);
1191 execlp (compress_program
, compress_program
, "-d", (char *) NULL
);
1193 async_safe_die (errno
, "couldn't execute compress program (with -d)");
1197 register_proc (temp
);
1201 fp
= fdopen (pipefds
[0], "r");
1204 int saved_errno
= errno
;
1206 errno
= saved_errno
;
1214 /* Append DIR to the array of temporary directory names. */
1216 add_temp_dir (char const *dir
)
1218 if (temp_dir_count
== temp_dir_alloc
)
1219 temp_dirs
= X2NREALLOC (temp_dirs
, &temp_dir_alloc
);
1221 temp_dirs
[temp_dir_count
++] = dir
;
1224 /* Remove NAME from the list of temporary files. */
1227 zaptemp (char const *name
)
1229 struct tempnode
*volatile *pnode
;
1230 struct tempnode
*node
;
1231 struct tempnode
*next
;
1233 int unlink_errno
= 0;
1234 struct cs_status cs
;
1236 for (pnode
= &temphead
; (node
= *pnode
)->name
!= name
; pnode
= &node
->next
)
1239 if (node
->state
== UNREAPED
)
1240 wait_proc (node
->pid
);
1242 /* Unlink the temporary file in a critical section to avoid races. */
1245 unlink_status
= unlink (name
);
1246 unlink_errno
= errno
;
1250 if (unlink_status
!= 0)
1251 error (0, unlink_errno
, _("warning: cannot remove: %s"), quotef (name
));
1257 #if HAVE_NL_LANGINFO
1260 struct_month_cmp (void const *m1
, void const *m2
)
1262 struct month
const *month1
= m1
;
1263 struct month
const *month2
= m2
;
1264 return strcmp (month1
->name
, month2
->name
);
1269 /* Initialize the character class tables. */
1276 for (i
= 0; i
< UCHAR_LIM
; ++i
)
1278 blanks
[i
] = !! isblank (i
);
1279 nonprinting
[i
] = ! isprint (i
);
1280 nondictionary
[i
] = ! isalnum (i
) && ! isblank (i
);
1281 fold_toupper
[i
] = toupper (i
);
1284 #if HAVE_NL_LANGINFO
1285 /* If we're not in the "C" locale, read different names for months. */
1288 for (i
= 0; i
< MONTHS_PER_YEAR
; i
++)
1295 s
= nl_langinfo (ABMON_1
+ i
);
1297 monthtab
[i
].name
= name
= xmalloc (s_len
+ 1);
1298 monthtab
[i
].val
= i
+ 1;
1300 for (j
= k
= 0; j
< s_len
; j
++)
1301 if (! isblank (to_uchar (s
[j
])))
1302 name
[k
++] = fold_toupper
[to_uchar (s
[j
])];
1305 qsort (monthtab
, MONTHS_PER_YEAR
, sizeof *monthtab
, struct_month_cmp
);
1310 /* Specify how many inputs may be merged at once.
1311 This may be set on the command-line with the
1312 --batch-size option. */
1314 specify_nmerge (int oi
, char c
, char const *s
)
1317 struct rlimit rlimit
;
1318 enum strtol_error e
= xstrtoumax (s
, NULL
, 10, &n
, NULL
);
1320 /* Try to find out how many file descriptors we'll be able
1321 to open. We need at least nmerge + 3 (STDIN_FILENO,
1322 STDOUT_FILENO and STDERR_FILENO). */
1323 unsigned int max_nmerge
= ((getrlimit (RLIMIT_NOFILE
, &rlimit
) == 0
1328 if (e
== LONGINT_OK
)
1332 e
= LONGINT_OVERFLOW
;
1337 error (0, 0, _("invalid --%s argument %s"),
1338 long_options
[oi
].name
, quote (s
));
1339 error (SORT_FAILURE
, 0,
1340 _("minimum --%s argument is %s"),
1341 long_options
[oi
].name
, quote ("2"));
1343 else if (max_nmerge
< nmerge
)
1345 e
= LONGINT_OVERFLOW
;
1352 if (e
== LONGINT_OVERFLOW
)
1354 char max_nmerge_buf
[INT_BUFSIZE_BOUND (max_nmerge
)];
1355 error (0, 0, _("--%s argument %s too large"),
1356 long_options
[oi
].name
, quote (s
));
1357 error (SORT_FAILURE
, 0,
1358 _("maximum --%s argument with current rlimit is %s"),
1359 long_options
[oi
].name
,
1360 uinttostr (max_nmerge
, max_nmerge_buf
));
1363 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1366 /* Specify the amount of main memory to use when sorting. */
1368 specify_sort_size (int oi
, char c
, char const *s
)
1372 enum strtol_error e
= xstrtoumax (s
, &suffix
, 10, &n
, "EgGkKmMPtTYZ");
1374 /* The default unit is KiB. */
1375 if (e
== LONGINT_OK
&& ISDIGIT (suffix
[-1]))
1377 if (n
<= UINTMAX_MAX
/ 1024)
1380 e
= LONGINT_OVERFLOW
;
1383 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1384 if (e
== LONGINT_INVALID_SUFFIX_CHAR
&& ISDIGIT (suffix
[-1]) && ! suffix
[1])
1393 double mem
= physmem_total () * n
/ 100;
1395 /* Use "<", not "<=", to avoid problems with rounding. */
1396 if (mem
< UINTMAX_MAX
)
1402 e
= LONGINT_OVERFLOW
;
1407 if (e
== LONGINT_OK
)
1409 /* If multiple sort sizes are specified, take the maximum, so
1410 that option order does not matter. */
1417 sort_size
= MAX (sort_size
, MIN_SORT_SIZE
);
1421 e
= LONGINT_OVERFLOW
;
1424 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1427 /* Specify the number of threads to spawn during internal sort. */
1429 specify_nthreads (int oi
, char c
, char const *s
)
1431 unsigned long int nthreads
;
1432 enum strtol_error e
= xstrtoul (s
, NULL
, 10, &nthreads
, "");
1433 if (e
== LONGINT_OVERFLOW
)
1435 if (e
!= LONGINT_OK
)
1436 xstrtol_fatal (e
, oi
, c
, long_options
, s
);
1437 if (SIZE_MAX
< nthreads
)
1438 nthreads
= SIZE_MAX
;
1440 error (SORT_FAILURE
, 0, _("number in parallel must be nonzero"));
1444 /* Return the default sort size. */
1446 default_sort_size (void)
1448 /* Let SIZE be MEM, but no more than the maximum object size,
1449 total memory, or system resource limits. Don't bother to check
1450 for values like RLIM_INFINITY since in practice they are not much
1451 less than SIZE_MAX. */
1452 size_t size
= SIZE_MAX
;
1453 struct rlimit rlimit
;
1454 if (getrlimit (RLIMIT_DATA
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
1455 size
= rlimit
.rlim_cur
;
1457 if (getrlimit (RLIMIT_AS
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
1458 size
= rlimit
.rlim_cur
;
1461 /* Leave a large safety margin for the above limits, as failure can
1462 occur when they are exceeded. */
1466 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1467 Exceeding RSS is not fatal, but can be quite slow. */
1468 if (getrlimit (RLIMIT_RSS
, &rlimit
) == 0 && rlimit
.rlim_cur
/ 16 * 15 < size
)
1469 size
= rlimit
.rlim_cur
/ 16 * 15;
1472 /* Let MEM be available memory or 1/8 of total memory, whichever
1474 double avail
= physmem_available ();
1475 double total
= physmem_total ();
1476 double mem
= MAX (avail
, total
/ 8);
1478 /* Leave a 1/4 margin for physical memory. */
1479 if (total
* 0.75 < size
)
1480 size
= total
* 0.75;
1482 /* Return the minimum of MEM and SIZE, but no less than
1483 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1484 right when only one argument is floating point. */
1487 return MAX (size
, MIN_SORT_SIZE
);
1490 /* Return the sort buffer size to use with the input files identified
1491 by FPS and FILES, which are alternate names of the same files.
1492 NFILES gives the number of input files; NFPS may be less. Assume
1493 that each input line requires LINE_BYTES extra bytes' worth of line
1494 information. Do not exceed the size bound specified by the user
1495 (or a default size bound, if the user does not specify one). */
1498 sort_buffer_size (FILE *const *fps
, size_t nfps
,
1499 char *const *files
, size_t nfiles
,
1502 /* A bound on the input size. If zero, the bound hasn't been
1504 static size_t size_bound
;
1506 /* In the worst case, each input byte is a newline. */
1507 size_t worst_case_per_input_byte
= line_bytes
+ 1;
1509 /* Keep enough room for one extra input line and an extra byte.
1510 This extra room might be needed when preparing to read EOF. */
1511 size_t size
= worst_case_per_input_byte
+ 1;
1515 for (i
= 0; i
< nfiles
; i
++)
1521 if ((i
< nfps
? fstat (fileno (fps
[i
]), &st
)
1522 : STREQ (files
[i
], "-") ? fstat (STDIN_FILENO
, &st
)
1523 : stat (files
[i
], &st
))
1525 die (_("stat failed"), files
[i
]);
1527 if (S_ISREG (st
.st_mode
))
1528 file_size
= st
.st_size
;
1531 /* The file has unknown size. If the user specified a sort
1532 buffer size, use that; otherwise, guess the size. */
1535 file_size
= INPUT_FILE_SIZE_GUESS
;
1540 size_bound
= sort_size
;
1542 size_bound
= default_sort_size ();
1545 /* Add the amount of memory needed to represent the worst case
1546 where the input consists entirely of newlines followed by a
1547 single non-newline. Check for overflow. */
1548 worst_case
= file_size
* worst_case_per_input_byte
+ 1;
1549 if (file_size
!= worst_case
/ worst_case_per_input_byte
1550 || size_bound
- size
<= worst_case
)
1558 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1559 must be at least sizeof (struct line). Allocate ALLOC bytes
1563 initbuf (struct buffer
*buf
, size_t line_bytes
, size_t alloc
)
1565 /* Ensure that the line array is properly aligned. If the desired
1566 size cannot be allocated, repeatedly halve it until allocation
1567 succeeds. The smaller allocation may hurt overall performance,
1568 but that's better than failing. */
1571 alloc
+= sizeof (struct line
) - alloc
% sizeof (struct line
);
1572 buf
->buf
= malloc (alloc
);
1576 if (alloc
<= line_bytes
+ 1)
1580 buf
->line_bytes
= line_bytes
;
1582 buf
->used
= buf
->left
= buf
->nlines
= 0;
1586 /* Return one past the limit of the line array. */
1588 static inline struct line
*
1589 buffer_linelim (struct buffer
const *buf
)
1591 void *linelim
= buf
->buf
+ buf
->alloc
;
1595 /* Return a pointer to the first character of the field specified
1599 begfield (struct line
const *line
, struct keyfield
const *key
)
1601 char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
1602 size_t sword
= key
->sword
;
1603 size_t schar
= key
->schar
;
1605 /* The leading field separator itself is included in a field when -t
1608 if (tab
!= TAB_DEFAULT
)
1609 while (ptr
< lim
&& sword
--)
1611 while (ptr
< lim
&& *ptr
!= tab
)
1617 while (ptr
< lim
&& sword
--)
1619 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1621 while (ptr
< lim
&& !blanks
[to_uchar (*ptr
)])
1625 /* If we're ignoring leading blanks when computing the Start
1626 of the field, skip past them here. */
1627 if (key
->skipsblanks
)
1628 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1631 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1632 ptr
= MIN (lim
, ptr
+ schar
);
1637 /* Return the limit of (a pointer to the first character after) the field
1638 in LINE specified by KEY. */
1641 limfield (struct line
const *line
, struct keyfield
const *key
)
1643 char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
1644 size_t eword
= key
->eword
, echar
= key
->echar
;
1647 eword
++; /* Skip all of end field. */
1649 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1650 whichever comes first. If there are more than EWORD fields, leave
1651 PTR pointing at the beginning of the field having zero-based index,
1652 EWORD. If a delimiter character was specified (via -t), then that
1653 'beginning' is the first character following the delimiting TAB.
1654 Otherwise, leave PTR pointing at the first 'blank' character after
1655 the preceding field. */
1656 if (tab
!= TAB_DEFAULT
)
1657 while (ptr
< lim
&& eword
--)
1659 while (ptr
< lim
&& *ptr
!= tab
)
1661 if (ptr
< lim
&& (eword
|| echar
))
1665 while (ptr
< lim
&& eword
--)
1667 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1669 while (ptr
< lim
&& !blanks
[to_uchar (*ptr
)])
1673 #ifdef POSIX_UNSPECIFIED
1674 /* The following block of code makes GNU sort incompatible with
1675 standard Unix sort, so it's ifdef'd out for now.
1676 The POSIX spec isn't clear on how to interpret this.
1677 FIXME: request clarification.
1679 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1680 Date: Thu, 30 May 96 12:20:41 -0400
1681 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1683 [...]I believe I've found another bug in 'sort'.
1688 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1691 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1695 Unix sort produced the answer I expected: sort on the single character
1696 in column 7. GNU sort produced different results, because it disagrees
1697 on the interpretation of the key-end spec "M.N". Unix sort reads this
1698 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1699 "skip M-1 fields, then either N-1 characters or the rest of the current
1700 field, whichever comes first". This extra clause applies only to
1701 key-ends, not key-starts.
1704 /* Make LIM point to the end of (one byte past) the current field. */
1705 if (tab
!= TAB_DEFAULT
)
1708 newlim
= memchr (ptr
, tab
, lim
- ptr
);
1716 while (newlim
< lim
&& blanks
[to_uchar (*newlim
)])
1718 while (newlim
< lim
&& !blanks
[to_uchar (*newlim
)])
1724 if (echar
!= 0) /* We need to skip over a portion of the end field. */
1726 /* If we're ignoring leading blanks when computing the End
1727 of the field, skip past them here. */
1728 if (key
->skipeblanks
)
1729 while (ptr
< lim
&& blanks
[to_uchar (*ptr
)])
1732 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1733 ptr
= MIN (lim
, ptr
+ echar
);
1739 /* Fill BUF reading from FP, moving buf->left bytes from the end
1740 of buf->buf to the beginning first. If EOF is reached and the
1741 file wasn't terminated by a newline, supply one. Set up BUF's line
1742 table too. FILE is the name of the file corresponding to FP.
1743 Return true if some input was read. */
1746 fillbuf (struct buffer
*buf
, FILE *fp
, char const *file
)
1748 struct keyfield
const *key
= keylist
;
1750 size_t line_bytes
= buf
->line_bytes
;
1751 size_t mergesize
= merge_buffer_size
- MIN_MERGE_BUFFER_SIZE
;
1756 if (buf
->used
!= buf
->left
)
1758 memmove (buf
->buf
, buf
->buf
+ buf
->used
- buf
->left
, buf
->left
);
1759 buf
->used
= buf
->left
;
1765 char *ptr
= buf
->buf
+ buf
->used
;
1766 struct line
*linelim
= buffer_linelim (buf
);
1767 struct line
*line
= linelim
- buf
->nlines
;
1768 size_t avail
= (char *) linelim
- buf
->nlines
* line_bytes
- ptr
;
1769 char *line_start
= buf
->nlines
? line
->text
+ line
->length
: buf
->buf
;
1771 while (line_bytes
+ 1 < avail
)
1773 /* Read as many bytes as possible, but do not read so many
1774 bytes that there might not be enough room for the
1775 corresponding line array. The worst case is when the
1776 rest of the input file consists entirely of newlines,
1777 except that the last byte is not a newline. */
1778 size_t readsize
= (avail
- 1) / (line_bytes
+ 1);
1779 size_t bytes_read
= fread (ptr
, 1, readsize
, fp
);
1780 char *ptrlim
= ptr
+ bytes_read
;
1782 avail
-= bytes_read
;
1784 if (bytes_read
!= readsize
)
1787 die (_("read failed"), file
);
1791 if (buf
->buf
== ptrlim
)
1793 if (line_start
!= ptrlim
&& ptrlim
[-1] != eol
)
1798 /* Find and record each line in the just-read input. */
1799 while ((p
= memchr (ptr
, eol
, ptrlim
- ptr
)))
1801 /* Delimit the line with NUL. This eliminates the need to
1802 temporarily replace the last byte with NUL when calling
1803 xmemcoll(), which increases performance. */
1807 line
->text
= line_start
;
1808 line
->length
= ptr
- line_start
;
1809 mergesize
= MAX (mergesize
, line
->length
);
1810 avail
-= line_bytes
;
1814 /* Precompute the position of the first key for
1816 line
->keylim
= (key
->eword
== SIZE_MAX
1818 : limfield (line
, key
));
1820 if (key
->sword
!= SIZE_MAX
)
1821 line
->keybeg
= begfield (line
, key
);
1824 if (key
->skipsblanks
)
1825 while (blanks
[to_uchar (*line_start
)])
1827 line
->keybeg
= line_start
;
1839 buf
->used
= ptr
- buf
->buf
;
1840 buf
->nlines
= buffer_linelim (buf
) - line
;
1841 if (buf
->nlines
!= 0)
1843 buf
->left
= ptr
- line_start
;
1844 merge_buffer_size
= mergesize
+ MIN_MERGE_BUFFER_SIZE
;
1849 /* The current input line is too long to fit in the buffer.
1850 Increase the buffer size and try again, keeping it properly
1852 size_t line_alloc
= buf
->alloc
/ sizeof (struct line
);
1853 buf
->buf
= x2nrealloc (buf
->buf
, &line_alloc
, sizeof (struct line
));
1854 buf
->alloc
= line_alloc
* sizeof (struct line
);
1859 /* Table that maps characters to order-of-magnitude values. */
1860 static char const unit_order
[UCHAR_LIM
] =
1862 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1863 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1864 /* This initializer syntax works on all C99 hosts. For now, use
1865 it only on non-ASCII hosts, to ease the pain of porting to
1866 pre-C99 ASCII hosts. */
1867 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1870 /* Generate the following table with this command:
1871 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1872 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1874 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1875 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1876 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1877 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1878 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1879 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1880 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1881 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1882 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1883 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1884 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1888 /* Return an integer that represents the order of magnitude of the
1889 unit following the number. The number may contain thousands
1890 separators and a decimal point, but it may not contain leading blanks.
1891 Negative numbers get negative orders; zero numbers have a zero order. */
1893 static int _GL_ATTRIBUTE_PURE
1894 find_unit_order (char const *number
)
1896 bool minus_sign
= (*number
== '-');
1897 char const *p
= number
+ minus_sign
;
1901 /* Scan to end of number.
1902 Decimals or separators not followed by digits stop the scan.
1903 Numbers ending in decimals or separators are thus considered
1904 to be lacking in units.
1905 FIXME: add support for multibyte thousands_sep and decimal_point. */
1909 while (ISDIGIT (ch
= *p
++))
1910 nonzero
|= ch
- '0';
1912 while (ch
== thousands_sep
);
1914 if (ch
== decimal_point
)
1915 while (ISDIGIT (ch
= *p
++))
1916 nonzero
|= ch
- '0';
1920 int order
= unit_order
[ch
];
1921 return (minus_sign
? -order
: order
);
1927 /* Compare numbers A and B ending in units with SI or IEC prefixes
1928 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1931 human_numcompare (char const *a
, char const *b
)
1933 while (blanks
[to_uchar (*a
)])
1935 while (blanks
[to_uchar (*b
)])
1938 int diff
= find_unit_order (a
) - find_unit_order (b
);
1939 return (diff
? diff
: strnumcmp (a
, b
, decimal_point
, thousands_sep
));
1942 /* Compare strings A and B as numbers without explicitly converting them to
1943 machine numbers. Comparatively slow for short strings, but asymptotically
1947 numcompare (char const *a
, char const *b
)
1949 while (blanks
[to_uchar (*a
)])
1951 while (blanks
[to_uchar (*b
)])
1954 return strnumcmp (a
, b
, decimal_point
, thousands_sep
);
1957 /* Work around a problem whereby the long double value returned by glibc's
1958 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
1959 A and B before calling strtold. FIXME: remove this function once
1960 gnulib guarantees that strtold's result is always well defined. */
1962 nan_compare (char const *sa
, char const *sb
)
1965 memset (&a
, 0, sizeof a
);
1966 a
= strtold (sa
, NULL
);
1969 memset (&b
, 0, sizeof b
);
1970 b
= strtold (sb
, NULL
);
1972 return memcmp (&a
, &b
, sizeof a
);
1976 general_numcompare (char const *sa
, char const *sb
)
1978 /* FIXME: maybe add option to try expensive FP conversion
1979 only if A and B can't be compared more cheaply/accurately. */
1983 long_double a
= strtold (sa
, &ea
);
1984 long_double b
= strtold (sb
, &eb
);
1986 /* Put conversion errors at the start of the collating sequence. */
1988 return sb
== eb
? 0 : -1;
1992 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1993 conversion errors but before numbers; sort them by internal
1994 bit-pattern, for lack of a more portable alternative. */
2000 : nan_compare (sa
, sb
));
2003 /* Return an integer in 1..12 of the month name MONTH.
2004 Return 0 if the name in S is not recognized. */
2007 getmonth (char const *month
, char **ea
)
2010 size_t hi
= MONTHS_PER_YEAR
;
2012 while (blanks
[to_uchar (*month
)])
2017 size_t ix
= (lo
+ hi
) / 2;
2018 char const *m
= month
;
2019 char const *n
= monthtab
[ix
].name
;
2027 return monthtab
[ix
].val
;
2029 if (to_uchar (fold_toupper
[to_uchar (*m
)]) < to_uchar (*n
))
2034 else if (to_uchar (fold_toupper
[to_uchar (*m
)]) > to_uchar (*n
))
2046 /* A randomly chosen MD5 state, used for random comparison. */
2047 static struct md5_ctx random_md5_state
;
2049 /* Initialize the randomly chosen MD5 state. */
2052 random_md5_state_init (char const *random_source
)
2054 unsigned char buf
[MD5_DIGEST_SIZE
];
2055 struct randread_source
*r
= randread_new (random_source
, sizeof buf
);
2057 die (_("open failed"), random_source
);
2058 randread (r
, buf
, sizeof buf
);
2059 if (randread_free (r
) != 0)
2060 die (_("close failed"), random_source
);
2061 md5_init_ctx (&random_md5_state
);
2062 md5_process_bytes (buf
, sizeof buf
, &random_md5_state
);
2065 /* This is like strxfrm, except it reports any error and exits. */
2068 xstrxfrm (char *restrict dest
, char const *restrict src
, size_t destsize
)
2071 size_t translated_size
= strxfrm (dest
, src
, destsize
);
2075 error (0, errno
, _("string transformation failed"));
2076 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2077 error (SORT_FAILURE
, 0,
2078 _("the untransformed string was %s"),
2079 quotearg_n_style (0, locale_quoting_style
, src
));
2082 return translated_size
;
2085 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2086 using one or more random hash functions. TEXTA[LENA] and
2087 TEXTB[LENB] must be zero. */
2090 compare_random (char *restrict texta
, size_t lena
,
2091 char *restrict textb
, size_t lenb
)
2093 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2094 data. This is used to break ties if there is a checksum
2095 collision, and this is good enough given the astronomically low
2096 probability of a collision. */
2099 char stackbuf
[4000];
2100 char *buf
= stackbuf
;
2101 size_t bufsize
= sizeof stackbuf
;
2102 void *allocated
= NULL
;
2103 uint32_t dig
[2][MD5_DIGEST_SIZE
/ sizeof (uint32_t)];
2104 struct md5_ctx s
[2];
2105 s
[0] = s
[1] = random_md5_state
;
2107 if (hard_LC_COLLATE
)
2109 char const *lima
= texta
+ lena
;
2110 char const *limb
= textb
+ lenb
;
2114 /* Transform the text into the basis of comparison, so that byte
2115 strings that would otherwise considered to be equal are
2116 considered equal here even if their bytes differ.
2118 Each time through this loop, transform one
2119 null-terminated string's worth from TEXTA or from TEXTB
2120 or both. That way, there's no need to store the
2121 transformation of the whole line, if it contains many
2122 null-terminated strings. */
2124 /* Store the transformed data into a big-enough buffer. */
2126 /* A 3X size guess avoids the overhead of calling strxfrm
2127 twice on typical implementations. Don't worry about
2128 size_t overflow, as the guess need not be correct. */
2129 size_t guess_bufsize
= 3 * (lena
+ lenb
) + 2;
2130 if (bufsize
< guess_bufsize
)
2132 bufsize
= MAX (guess_bufsize
, bufsize
* 3 / 2);
2134 buf
= allocated
= malloc (bufsize
);
2138 bufsize
= sizeof stackbuf
;
2143 (texta
< lima
? xstrxfrm (buf
, texta
, bufsize
) + 1 : 0);
2144 bool a_fits
= sizea
<= bufsize
;
2147 ? (xstrxfrm ((a_fits
? buf
+ sizea
: NULL
), textb
,
2148 (a_fits
? bufsize
- sizea
: 0))
2152 if (! (a_fits
&& sizea
+ sizeb
<= bufsize
))
2154 bufsize
= sizea
+ sizeb
;
2155 if (bufsize
< SIZE_MAX
/ 3)
2156 bufsize
= bufsize
* 3 / 2;
2158 buf
= allocated
= xmalloc (bufsize
);
2160 strxfrm (buf
, texta
, sizea
);
2162 strxfrm (buf
+ sizea
, textb
, sizeb
);
2165 /* Advance past NULs to the next part of each input string,
2166 exiting the loop if both strings are exhausted. When
2167 exiting the loop, prepare to finish off the tiebreaker
2168 comparison properly. */
2170 texta
+= strlen (texta
) + 1;
2172 textb
+= strlen (textb
) + 1;
2173 if (! (texta
< lima
|| textb
< limb
))
2175 lena
= sizea
; texta
= buf
;
2176 lenb
= sizeb
; textb
= buf
+ sizea
;
2180 /* Accumulate the transformed data in the corresponding
2182 md5_process_bytes (buf
, sizea
, &s
[0]);
2183 md5_process_bytes (buf
+ sizea
, sizeb
, &s
[1]);
2185 /* Update the tiebreaker comparison of the transformed data. */
2188 xfrm_diff
= memcmp (buf
, buf
+ sizea
, MIN (sizea
, sizeb
));
2190 xfrm_diff
= (sizea
> sizeb
) - (sizea
< sizeb
);
2195 /* Compute and compare the checksums. */
2196 md5_process_bytes (texta
, lena
, &s
[0]); md5_finish_ctx (&s
[0], dig
[0]);
2197 md5_process_bytes (textb
, lenb
, &s
[1]); md5_finish_ctx (&s
[1], dig
[1]);
2198 int diff
= memcmp (dig
[0], dig
[1], sizeof dig
[0]);
2200 /* Fall back on the tiebreaker if the checksums collide. */
2205 xfrm_diff
= memcmp (texta
, textb
, MIN (lena
, lenb
));
2207 xfrm_diff
= (lena
> lenb
) - (lena
< lenb
);
2218 /* Return the printable width of the block of memory starting at
2219 TEXT and ending just before LIM, counting each tab as one byte.
2220 FIXME: Should we generally be counting non printable chars? */
2223 debug_width (char const *text
, char const *lim
)
2225 size_t width
= mbsnwidth (text
, lim
- text
, 0);
2227 width
+= (*text
++ == '\t');
2231 /* For debug mode, "underline" a key at the
2232 specified offset and screen width. */
2235 mark_key (size_t offset
, size_t width
)
2241 printf (_("^ no match for key\n"));
2252 /* Return true if KEY is a numeric key. */
2255 key_numeric (struct keyfield
const *key
)
2257 return key
->numeric
|| key
->general_numeric
|| key
->human_numeric
;
2260 /* For LINE, output a debugging line that underlines KEY in LINE.
2261 If KEY is null, underline the whole line. */
2264 debug_key (struct line
const *line
, struct keyfield
const *key
)
2266 char *text
= line
->text
;
2268 char *lim
= text
+ line
->length
- 1;
2272 if (key
->sword
!= SIZE_MAX
)
2273 beg
= begfield (line
, key
);
2274 if (key
->eword
!= SIZE_MAX
)
2275 lim
= limfield (line
, key
);
2277 if (key
->skipsblanks
|| key
->month
|| key_numeric (key
))
2282 while (blanks
[to_uchar (*beg
)])
2285 char *tighter_lim
= beg
;
2289 else if (key
->month
)
2290 getmonth (beg
, &tighter_lim
);
2291 else if (key
->general_numeric
)
2292 ignore_value (strtold (beg
, &tighter_lim
));
2293 else if (key
->numeric
|| key
->human_numeric
)
2295 char *p
= beg
+ (beg
< lim
&& *beg
== '-');
2296 bool found_digit
= false;
2301 while (ISDIGIT (ch
= *p
++))
2304 while (ch
== thousands_sep
);
2306 if (ch
== decimal_point
)
2307 while (ISDIGIT (ch
= *p
++))
2311 tighter_lim
= p
- ! (key
->human_numeric
&& unit_order
[ch
]);
2321 size_t offset
= debug_width (text
, beg
);
2322 size_t width
= debug_width (beg
, lim
);
2323 mark_key (offset
, width
);
2326 /* Debug LINE by underlining its keys. */
2329 debug_line (struct line
const *line
)
2331 struct keyfield
const *key
= keylist
;
2334 debug_key (line
, key
);
2335 while (key
&& ((key
= key
->next
) || ! (unique
|| stable
)));
2338 /* Return whether sorting options specified for key. */
2341 default_key_compare (struct keyfield
const *key
)
2343 return ! (key
->ignore
2347 || key_numeric (key
)
2351 /* || key->reverse */
2355 /* Convert a key to the short options used to specify it. */
2358 key_to_opts (struct keyfield
const *key
, char *opts
)
2360 if (key
->skipsblanks
|| key
->skipeblanks
)
2361 *opts
++ = 'b';/* either disables global -b */
2362 if (key
->ignore
== nondictionary
)
2366 if (key
->general_numeric
)
2368 if (key
->human_numeric
)
2370 if (key
->ignore
== nonprinting
)
2385 /* Output data independent key warnings to stderr. */
2388 key_warnings (struct keyfield
const *gkey
, bool gkey_only
)
2390 struct keyfield
const *key
;
2391 struct keyfield ugkey
= *gkey
;
2392 unsigned long keynum
= 1;
2394 for (key
= keylist
; key
; key
= key
->next
, keynum
++)
2396 if (key
->obsolete_used
)
2398 size_t sword
= key
->sword
;
2399 size_t eword
= key
->eword
;
2400 char tmp
[INT_BUFSIZE_BOUND (uintmax_t)];
2401 /* obsolescent syntax +A.x -B.y is equivalent to:
2402 -k A+1.x+1,B.y (when y = 0)
2403 -k A+1.x+1,B+1.y (when y > 0) */
2404 char obuf
[INT_BUFSIZE_BOUND (sword
) * 2 + 4]; /* +# -# */
2405 char nbuf
[INT_BUFSIZE_BOUND (sword
) * 2 + 5]; /* -k #,# */
2409 if (sword
== SIZE_MAX
)
2412 po
= stpcpy (stpcpy (po
, "+"), umaxtostr (sword
, tmp
));
2413 pn
= stpcpy (stpcpy (pn
, "-k "), umaxtostr (sword
+ 1, tmp
));
2414 if (key
->eword
!= SIZE_MAX
)
2416 stpcpy (stpcpy (po
, " -"), umaxtostr (eword
+ 1, tmp
));
2417 stpcpy (stpcpy (pn
, ","),
2418 umaxtostr (eword
+ 1
2419 + (key
->echar
== SIZE_MAX
), tmp
));
2421 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2422 quote_n (0, obuf
), quote_n (1, nbuf
));
2425 /* Warn about field specs that will never match. */
2426 if (key
->sword
!= SIZE_MAX
&& key
->eword
< key
->sword
)
2427 error (0, 0, _("key %lu has zero width and will be ignored"), keynum
);
2429 /* Warn about significant leading blanks. */
2430 bool implicit_skip
= key_numeric (key
) || key
->month
;
2431 bool maybe_space_aligned
= !hard_LC_COLLATE
&& default_key_compare (key
)
2432 && !(key
->schar
|| key
->echar
);
2433 bool line_offset
= key
->eword
== 0 && key
->echar
!= 0; /* -k1.x,1.y */
2434 if (!gkey_only
&& tab
== TAB_DEFAULT
&& !line_offset
2435 && ((!key
->skipsblanks
&& !(implicit_skip
|| maybe_space_aligned
))
2436 || (!key
->skipsblanks
&& key
->schar
)
2437 || (!key
->skipeblanks
&& key
->echar
)))
2438 error (0, 0, _("leading blanks are significant in key %lu; "
2439 "consider also specifying 'b'"), keynum
);
2441 /* Warn about numeric comparisons spanning fields,
2442 as field delimiters could be interpreted as part
2443 of the number (maybe only in other locales). */
2444 if (!gkey_only
&& key_numeric (key
))
2446 size_t sword
= key
->sword
+ 1;
2447 size_t eword
= key
->eword
+ 1;
2450 if (!eword
|| sword
< eword
)
2451 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2455 /* Flag global options not copied or specified in any key. */
2456 if (ugkey
.ignore
&& (ugkey
.ignore
== key
->ignore
))
2457 ugkey
.ignore
= NULL
;
2458 if (ugkey
.translate
&& (ugkey
.translate
== key
->translate
))
2459 ugkey
.translate
= NULL
;
2460 ugkey
.skipsblanks
&= !key
->skipsblanks
;
2461 ugkey
.skipeblanks
&= !key
->skipeblanks
;
2462 ugkey
.month
&= !key
->month
;
2463 ugkey
.numeric
&= !key
->numeric
;
2464 ugkey
.general_numeric
&= !key
->general_numeric
;
2465 ugkey
.human_numeric
&= !key
->human_numeric
;
2466 ugkey
.random
&= !key
->random
;
2467 ugkey
.version
&= !key
->version
;
2468 ugkey
.reverse
&= !key
->reverse
;
2471 /* Warn about ignored global options flagged above.
2472 Note if gkey is the only one in the list, all flags are cleared. */
2473 if (!default_key_compare (&ugkey
)
2474 || (ugkey
.reverse
&& (stable
|| unique
) && keylist
))
2476 bool ugkey_reverse
= ugkey
.reverse
;
2477 if (!(stable
|| unique
))
2478 ugkey
.reverse
= false;
2479 /* The following is too big, but guaranteed to be "big enough". */
2480 char opts
[sizeof short_options
];
2481 key_to_opts (&ugkey
, opts
);
2483 ngettext ("option '-%s' is ignored",
2484 "options '-%s' are ignored",
2485 select_plural (strlen (opts
))), opts
);
2486 ugkey
.reverse
= ugkey_reverse
;
2488 if (ugkey
.reverse
&& !(stable
|| unique
) && keylist
)
2489 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2492 /* Compare two lines A and B trying every key in sequence until there
2493 are no more keys or a difference is found. */
2496 keycompare (struct line
const *a
, struct line
const *b
)
2498 struct keyfield
*key
= keylist
;
2500 /* For the first iteration only, the key positions have been
2501 precomputed for us. */
2502 char *texta
= a
->keybeg
;
2503 char *textb
= b
->keybeg
;
2504 char *lima
= a
->keylim
;
2505 char *limb
= b
->keylim
;
2511 char const *translate
= key
->translate
;
2512 bool const *ignore
= key
->ignore
;
2514 /* Treat field ends before field starts as empty fields. */
2515 lima
= MAX (texta
, lima
);
2516 limb
= MAX (textb
, limb
);
2518 /* Find the lengths. */
2519 size_t lena
= lima
- texta
;
2520 size_t lenb
= limb
- textb
;
2522 if (hard_LC_COLLATE
|| key_numeric (key
)
2523 || key
->month
|| key
->random
|| key
->version
)
2530 char enda
IF_LINT (= 0);
2531 char endb
IF_LINT (= 0);
2532 void *allocated
IF_LINT (= NULL
);
2533 char stackbuf
[4000];
2535 if (ignore
|| translate
)
2537 /* Compute with copies of the keys, which are the result of
2538 translating or ignoring characters, and which need their
2543 /* Allocate space for copies. */
2544 size_t size
= lena
+ 1 + lenb
+ 1;
2545 if (size
<= sizeof stackbuf
)
2546 ta
= stackbuf
, allocated
= NULL
;
2548 ta
= allocated
= xmalloc (size
);
2551 /* Put into each copy a version of the key in which the
2552 requested characters are ignored or translated. */
2553 for (tlena
= i
= 0; i
< lena
; i
++)
2554 if (! (ignore
&& ignore
[to_uchar (texta
[i
])]))
2555 ta
[tlena
++] = (translate
2556 ? translate
[to_uchar (texta
[i
])]
2560 for (tlenb
= i
= 0; i
< lenb
; i
++)
2561 if (! (ignore
&& ignore
[to_uchar (textb
[i
])]))
2562 tb
[tlenb
++] = (translate
2563 ? translate
[to_uchar (textb
[i
])]
2569 /* Use the keys in-place, temporarily null-terminated. */
2570 ta
= texta
; tlena
= lena
; enda
= ta
[tlena
]; ta
[tlena
] = '\0';
2571 tb
= textb
; tlenb
= lenb
; endb
= tb
[tlenb
]; tb
[tlenb
] = '\0';
2575 diff
= numcompare (ta
, tb
);
2576 else if (key
->general_numeric
)
2577 diff
= general_numcompare (ta
, tb
);
2578 else if (key
->human_numeric
)
2579 diff
= human_numcompare (ta
, tb
);
2580 else if (key
->month
)
2581 diff
= getmonth (ta
, NULL
) - getmonth (tb
, NULL
);
2582 else if (key
->random
)
2583 diff
= compare_random (ta
, tlena
, tb
, tlenb
);
2584 else if (key
->version
)
2585 diff
= filevercmp (ta
, tb
);
2588 /* Locale-dependent string sorting. This is slower than
2589 C-locale sorting, which is implemented below. */
2591 diff
= - NONZERO (tlenb
);
2592 else if (tlenb
== 0)
2595 diff
= xmemcoll0 (ta
, tlena
+ 1, tb
, tlenb
+ 1);
2598 if (ignore
|| translate
)
2608 #define CMP_WITH_IGNORE(A, B) \
2613 while (texta < lima && ignore[to_uchar (*texta)]) \
2615 while (textb < limb && ignore[to_uchar (*textb)]) \
2617 if (! (texta < lima && textb < limb)) \
2619 diff = to_uchar (A) - to_uchar (B); \
2626 diff = (texta < lima) - (textb < limb); \
2631 CMP_WITH_IGNORE (translate
[to_uchar (*texta
)],
2632 translate
[to_uchar (*textb
)]);
2634 CMP_WITH_IGNORE (*texta
, *textb
);
2637 diff
= - NONZERO (lenb
);
2644 while (texta
< lima
&& textb
< limb
)
2646 diff
= (to_uchar (translate
[to_uchar (*texta
++)])
2647 - to_uchar (translate
[to_uchar (*textb
++)]));
2654 diff
= memcmp (texta
, textb
, MIN (lena
, lenb
));
2658 diff
= lena
< lenb
? -1 : lena
!= lenb
;
2668 /* Find the beginning and limit of the next field. */
2669 if (key
->eword
!= SIZE_MAX
)
2670 lima
= limfield (a
, key
), limb
= limfield (b
, key
);
2672 lima
= a
->text
+ a
->length
- 1, limb
= b
->text
+ b
->length
- 1;
2674 if (key
->sword
!= SIZE_MAX
)
2675 texta
= begfield (a
, key
), textb
= begfield (b
, key
);
2678 texta
= a
->text
, textb
= b
->text
;
2679 if (key
->skipsblanks
)
2681 while (texta
< lima
&& blanks
[to_uchar (*texta
)])
2683 while (textb
< limb
&& blanks
[to_uchar (*textb
)])
2694 return key
->reverse
? -diff
: diff
;
2697 /* Compare two lines A and B, returning negative, zero, or positive
2698 depending on whether A compares less than, equal to, or greater than B. */
2701 compare (struct line
const *a
, struct line
const *b
)
2706 /* First try to compare on the specified keys (if any).
2707 The only two cases with no key at all are unadorned sort,
2708 and unadorned sort -r. */
2711 diff
= keycompare (a
, b
);
2712 if (diff
|| unique
|| stable
)
2716 /* If the keys all compare equal (or no keys were specified)
2717 fall through to the default comparison. */
2718 alen
= a
->length
- 1, blen
= b
->length
- 1;
2721 diff
= - NONZERO (blen
);
2724 else if (hard_LC_COLLATE
)
2726 /* Note xmemcoll0 is a performance enhancement as
2727 it will not unconditionally write '\0' after the
2728 passed in buffers, which was seen to give around
2729 a 3% increase in performance for short lines. */
2730 diff
= xmemcoll0 (a
->text
, alen
+ 1, b
->text
, blen
+ 1);
2732 else if (! (diff
= memcmp (a
->text
, b
->text
, MIN (alen
, blen
))))
2733 diff
= alen
< blen
? -1 : alen
!= blen
;
2735 return reverse
? -diff
: diff
;
2738 /* Write LINE to output stream FP; the output file's name is
2739 OUTPUT_FILE if OUTPUT_FILE is nonnull, and is the standard output
2740 otherwise. If debugging is enabled and FP is standard output,
2741 append some debugging information. */
2744 write_line (struct line
const *line
, FILE *fp
, char const *output_file
)
2746 char *buf
= line
->text
;
2747 size_t n_bytes
= line
->length
;
2748 char *ebuf
= buf
+ n_bytes
;
2750 if (!output_file
&& debug
)
2752 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2753 char const *c
= buf
;
2762 if (fputc (wc
, fp
) == EOF
)
2763 die (_("write failed"), output_file
);
2771 if (fwrite (buf
, 1, n_bytes
, fp
) != n_bytes
)
2772 die (_("write failed"), output_file
);
2777 /* Check that the lines read from FILE_NAME come in order. Return
2778 true if they are in order. If CHECKONLY == 'c', also print a
2779 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2780 they are not in order. */
2783 check (char const *file_name
, char checkonly
)
2785 FILE *fp
= xfopen (file_name
, "r");
2786 struct buffer buf
; /* Input buffer. */
2787 struct line temp
; /* Copy of previous line. */
2789 uintmax_t line_number
= 0;
2790 struct keyfield
const *key
= keylist
;
2791 bool nonunique
= ! unique
;
2792 bool ordered
= true;
2794 initbuf (&buf
, sizeof (struct line
),
2795 MAX (merge_buffer_size
, sort_size
));
2798 while (fillbuf (&buf
, fp
, file_name
))
2800 struct line
const *line
= buffer_linelim (&buf
);
2801 struct line
const *linebase
= line
- buf
.nlines
;
2803 /* Make sure the line saved from the old buffer contents is
2804 less than or equal to the first line of the new buffer. */
2805 if (alloc
&& nonunique
<= compare (&temp
, line
- 1))
2809 if (checkonly
== 'c')
2811 struct line
const *disorder_line
= line
- 1;
2812 uintmax_t disorder_line_number
=
2813 buffer_linelim (&buf
) - disorder_line
+ line_number
;
2814 char hr_buf
[INT_BUFSIZE_BOUND (disorder_line_number
)];
2815 fprintf (stderr
, _("%s: %s:%s: disorder: "),
2816 program_name
, file_name
,
2817 umaxtostr (disorder_line_number
, hr_buf
));
2818 write_line (disorder_line
, stderr
, _("standard error"));
2826 /* Compare each line in the buffer with its successor. */
2827 while (linebase
< --line
)
2828 if (nonunique
<= compare (line
, line
- 1))
2829 goto found_disorder
;
2831 line_number
+= buf
.nlines
;
2833 /* Save the last line of the buffer. */
2834 if (alloc
< line
->length
)
2841 alloc
= line
->length
;
2845 while (alloc
< line
->length
);
2848 temp
.text
= xmalloc (alloc
);
2850 memcpy (temp
.text
, line
->text
, line
->length
);
2851 temp
.length
= line
->length
;
2854 temp
.keybeg
= temp
.text
+ (line
->keybeg
- line
->text
);
2855 temp
.keylim
= temp
.text
+ (line
->keylim
- line
->text
);
2859 xfclose (fp
, file_name
);
2865 /* Open FILES (there are NFILES of them) and store the resulting array
2866 of stream pointers into (*PFPS). Allocate the array. Return the
2867 number of successfully opened files, setting errno if this value is
2868 less than NFILES. */
2871 open_input_files (struct sortfile
*files
, size_t nfiles
, FILE ***pfps
)
2873 FILE **fps
= *pfps
= xnmalloc (nfiles
, sizeof *fps
);
2876 /* Open as many input files as we can. */
2877 for (i
= 0; i
< nfiles
; i
++)
2879 fps
[i
] = (files
[i
].temp
&& files
[i
].temp
->state
!= UNCOMPRESSED
2880 ? open_temp (files
[i
].temp
)
2881 : stream_open (files
[i
].name
, "r"));
2889 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2890 files (all of which are at the start of the FILES array), and
2891 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2892 FPS is the vector of open stream corresponding to the files.
2893 Close input and output streams before returning.
2894 OUTPUT_FILE gives the name of the output file. If it is NULL,
2895 the output file is standard output. */
2898 mergefps (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
2899 FILE *ofp
, char const *output_file
, FILE **fps
)
2901 struct buffer
*buffer
= xnmalloc (nfiles
, sizeof *buffer
);
2902 /* Input buffers for each file. */
2903 struct line saved
; /* Saved line storage for unique check. */
2904 struct line
const *savedline
= NULL
;
2905 /* &saved if there is a saved line. */
2906 size_t savealloc
= 0; /* Size allocated for the saved line. */
2907 struct line
const **cur
= xnmalloc (nfiles
, sizeof *cur
);
2908 /* Current line in each line table. */
2909 struct line
const **base
= xnmalloc (nfiles
, sizeof *base
);
2910 /* Base of each line table. */
2911 size_t *ord
= xnmalloc (nfiles
, sizeof *ord
);
2912 /* Table representing a permutation of fps,
2913 such that cur[ord[0]] is the smallest line
2914 and will be next output. */
2918 struct keyfield
const *key
= keylist
;
2921 /* Read initial lines from each input file. */
2922 for (i
= 0; i
< nfiles
; )
2924 initbuf (&buffer
[i
], sizeof (struct line
),
2925 MAX (merge_buffer_size
, sort_size
/ nfiles
));
2926 if (fillbuf (&buffer
[i
], fps
[i
], files
[i
].name
))
2928 struct line
const *linelim
= buffer_linelim (&buffer
[i
]);
2929 cur
[i
] = linelim
- 1;
2930 base
[i
] = linelim
- buffer
[i
].nlines
;
2935 /* fps[i] is empty; eliminate it from future consideration. */
2936 xfclose (fps
[i
], files
[i
].name
);
2940 zaptemp (files
[i
].name
);
2942 free (buffer
[i
].buf
);
2944 for (j
= i
; j
< nfiles
; ++j
)
2946 files
[j
] = files
[j
+ 1];
2947 fps
[j
] = fps
[j
+ 1];
2952 /* Set up the ord table according to comparisons among input lines.
2953 Since this only reorders two items if one is strictly greater than
2954 the other, it is stable. */
2955 for (i
= 0; i
< nfiles
; ++i
)
2957 for (i
= 1; i
< nfiles
; ++i
)
2958 if (0 < compare (cur
[ord
[i
- 1]], cur
[ord
[i
]]))
2959 t
= ord
[i
- 1], ord
[i
- 1] = ord
[i
], ord
[i
] = t
, i
= 0;
2961 /* Repeatedly output the smallest line until no input remains. */
2964 struct line
const *smallest
= cur
[ord
[0]];
2966 /* If uniquified output is turned on, output only the first of
2967 an identical series of lines. */
2970 if (savedline
&& compare (savedline
, smallest
))
2973 write_line (&saved
, ofp
, output_file
);
2978 if (savealloc
< smallest
->length
)
2983 savealloc
= smallest
->length
;
2986 while ((savealloc
*= 2) < smallest
->length
);
2989 saved
.text
= xmalloc (savealloc
);
2991 saved
.length
= smallest
->length
;
2992 memcpy (saved
.text
, smallest
->text
, saved
.length
);
2996 saved
.text
+ (smallest
->keybeg
- smallest
->text
);
2998 saved
.text
+ (smallest
->keylim
- smallest
->text
);
3003 write_line (smallest
, ofp
, output_file
);
3005 /* Check if we need to read more lines into core. */
3006 if (base
[ord
[0]] < smallest
)
3007 cur
[ord
[0]] = smallest
- 1;
3010 if (fillbuf (&buffer
[ord
[0]], fps
[ord
[0]], files
[ord
[0]].name
))
3012 struct line
const *linelim
= buffer_linelim (&buffer
[ord
[0]]);
3013 cur
[ord
[0]] = linelim
- 1;
3014 base
[ord
[0]] = linelim
- buffer
[ord
[0]].nlines
;
3018 /* We reached EOF on fps[ord[0]]. */
3019 for (i
= 1; i
< nfiles
; ++i
)
3020 if (ord
[i
] > ord
[0])
3023 xfclose (fps
[ord
[0]], files
[ord
[0]].name
);
3024 if (ord
[0] < ntemps
)
3027 zaptemp (files
[ord
[0]].name
);
3029 free (buffer
[ord
[0]].buf
);
3030 for (i
= ord
[0]; i
< nfiles
; ++i
)
3032 fps
[i
] = fps
[i
+ 1];
3033 files
[i
] = files
[i
+ 1];
3034 buffer
[i
] = buffer
[i
+ 1];
3035 cur
[i
] = cur
[i
+ 1];
3036 base
[i
] = base
[i
+ 1];
3038 for (i
= 0; i
< nfiles
; ++i
)
3039 ord
[i
] = ord
[i
+ 1];
3044 /* The new line just read in may be larger than other lines
3045 already in main memory; push it back in the queue until we
3046 encounter a line larger than it. Optimize for the common
3047 case where the new line is smallest. */
3052 size_t ord0
= ord
[0];
3053 size_t count_of_smaller_lines
;
3057 int cmp
= compare (cur
[ord0
], cur
[ord
[probe
]]);
3058 if (cmp
< 0 || (cmp
== 0 && ord0
< ord
[probe
]))
3062 probe
= (lo
+ hi
) / 2;
3065 count_of_smaller_lines
= lo
- 1;
3066 for (j
= 0; j
< count_of_smaller_lines
; j
++)
3067 ord
[j
] = ord
[j
+ 1];
3068 ord
[count_of_smaller_lines
] = ord0
;
3072 if (unique
&& savedline
)
3074 write_line (&saved
, ofp
, output_file
);
3078 xfclose (ofp
, output_file
);
3086 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3087 files (all of which are at the start of the FILES array), and
3088 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3089 Close input and output files before returning.
3090 OUTPUT_FILE gives the name of the output file.
3092 Return the number of files successfully merged. This number can be
3093 less than NFILES if we ran low on file descriptors, but in this
3094 case it is never less than 2. */
3097 mergefiles (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
3098 FILE *ofp
, char const *output_file
)
3101 size_t nopened
= open_input_files (files
, nfiles
, &fps
);
3102 if (nopened
< nfiles
&& nopened
< 2)
3103 die (_("open failed"), files
[nopened
].name
);
3104 mergefps (files
, ntemps
, nopened
, ofp
, output_file
, fps
);
3108 /* Merge into T (of size NLINES) the two sorted arrays of lines
3109 LO (with NLINES / 2 members), and
3110 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3111 T and LO point just past their respective arrays, and the arrays
3112 are in reverse order. NLINES must be at least 2. */
3115 mergelines (struct line
*restrict t
, size_t nlines
,
3116 struct line
const *restrict lo
)
3118 size_t nlo
= nlines
/ 2;
3119 size_t nhi
= nlines
- nlo
;
3120 struct line
*hi
= t
- nlo
;
3123 if (compare (lo
- 1, hi
- 1) <= 0)
3128 /* HI must equal T now, and there is no need to copy from
3147 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3148 Do this all within one thread. NLINES must be at least 2.
3149 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3150 Otherwise the sort is in-place and TEMP is half-sized.
3151 The input and output arrays are in reverse order, and LINES and
3152 TEMP point just past the end of their respective arrays.
3154 Use a recursive divide-and-conquer algorithm, in the style
3155 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3156 the optimization suggested by exercise 5.2.4-10; this requires room
3157 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3158 writes that this memory optimization was originally published by
3159 D. A. Bell, Comp J. 1 (1958), 75. */
3162 sequential_sort (struct line
*restrict lines
, size_t nlines
,
3163 struct line
*restrict temp
, bool to_temp
)
3167 /* Declare 'swap' as int, not bool, to work around a bug
3168 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3169 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3170 int swap
= (0 < compare (&lines
[-1], &lines
[-2]));
3173 temp
[-1] = lines
[-1 - swap
];
3174 temp
[-2] = lines
[-2 + swap
];
3178 temp
[-1] = lines
[-1];
3179 lines
[-1] = lines
[-2];
3180 lines
[-2] = temp
[-1];
3185 size_t nlo
= nlines
/ 2;
3186 size_t nhi
= nlines
- nlo
;
3187 struct line
*lo
= lines
;
3188 struct line
*hi
= lines
- nlo
;
3190 sequential_sort (hi
, nhi
, temp
- (to_temp
? nlo
: 0), to_temp
);
3192 sequential_sort (lo
, nlo
, temp
, !to_temp
);
3197 struct line
const *sorted_lo
;
3208 mergelines (dest
, nlines
, sorted_lo
);
3212 static struct merge_node
*init_node (struct merge_node
*restrict
,
3213 struct merge_node
*restrict
,
3214 struct line
*, size_t, size_t, bool);
3217 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3218 lines, with destination DEST. */
3219 static struct merge_node
*
3220 merge_tree_init (size_t nthreads
, size_t nlines
, struct line
*dest
)
3222 struct merge_node
*merge_tree
= xmalloc (2 * sizeof *merge_tree
* nthreads
);
3224 struct merge_node
*root
= merge_tree
;
3225 root
->lo
= root
->hi
= root
->end_lo
= root
->end_hi
= NULL
;
3227 root
->nlo
= root
->nhi
= nlines
;
3228 root
->parent
= NULL
;
3229 root
->level
= MERGE_END
;
3230 root
->queued
= false;
3231 pthread_mutex_init (&root
->lock
, NULL
);
3233 init_node (root
, root
+ 1, dest
, nthreads
, nlines
, false);
3237 /* Destroy the merge tree. */
3239 merge_tree_destroy (size_t nthreads
, struct merge_node
*merge_tree
)
3241 size_t n_nodes
= nthreads
* 2;
3242 struct merge_node
*node
= merge_tree
;
3246 pthread_mutex_destroy (&node
->lock
);
3253 /* Initialize a merge tree node and its descendants. The node's
3254 parent is PARENT. The node and its descendants are taken from the
3255 array of nodes NODE_POOL. Their destination starts at DEST; they
3256 will consume NTHREADS threads. The total number of sort lines is
3257 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3260 static struct merge_node
*
3261 init_node (struct merge_node
*restrict parent
,
3262 struct merge_node
*restrict node_pool
,
3263 struct line
*dest
, size_t nthreads
,
3264 size_t total_lines
, bool is_lo_child
)
3266 size_t nlines
= (is_lo_child
? parent
->nlo
: parent
->nhi
);
3267 size_t nlo
= nlines
/ 2;
3268 size_t nhi
= nlines
- nlo
;
3269 struct line
*lo
= dest
- total_lines
;
3270 struct line
*hi
= lo
- nlo
;
3271 struct line
**parent_end
= (is_lo_child
? &parent
->end_lo
: &parent
->end_hi
);
3273 struct merge_node
*node
= node_pool
++;
3274 node
->lo
= node
->end_lo
= lo
;
3275 node
->hi
= node
->end_hi
= hi
;
3276 node
->dest
= parent_end
;
3279 node
->parent
= parent
;
3280 node
->level
= parent
->level
+ 1;
3281 node
->queued
= false;
3282 pthread_mutex_init (&node
->lock
, NULL
);
3286 size_t lo_threads
= nthreads
/ 2;
3287 size_t hi_threads
= nthreads
- lo_threads
;
3288 node
->lo_child
= node_pool
;
3289 node_pool
= init_node (node
, node_pool
, lo
, lo_threads
,
3291 node
->hi_child
= node_pool
;
3292 node_pool
= init_node (node
, node_pool
, hi
, hi_threads
,
3293 total_lines
, false);
3297 node
->lo_child
= NULL
;
3298 node
->hi_child
= NULL
;
3304 /* Compare two merge nodes A and B for priority. */
3307 compare_nodes (void const *a
, void const *b
)
3309 struct merge_node
const *nodea
= a
;
3310 struct merge_node
const *nodeb
= b
;
3311 if (nodea
->level
== nodeb
->level
)
3312 return (nodea
->nlo
+ nodea
->nhi
) < (nodeb
->nlo
+ nodeb
->nhi
);
3313 return nodea
->level
< nodeb
->level
;
3316 /* Lock a merge tree NODE. */
3319 lock_node (struct merge_node
*node
)
3321 pthread_mutex_lock (&node
->lock
);
3324 /* Unlock a merge tree NODE. */
3327 unlock_node (struct merge_node
*node
)
3329 pthread_mutex_unlock (&node
->lock
);
3332 /* Destroy merge QUEUE. */
3335 queue_destroy (struct merge_node_queue
*queue
)
3337 heap_free (queue
->priority_queue
);
3338 pthread_cond_destroy (&queue
->cond
);
3339 pthread_mutex_destroy (&queue
->mutex
);
3342 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3343 NTHREADS threads. */
3346 queue_init (struct merge_node_queue
*queue
, size_t nthreads
)
3348 /* Though it's highly unlikely all nodes are in the heap at the same
3349 time, the heap should accommodate all of them. Counting a NULL
3350 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3351 queue
->priority_queue
= heap_alloc (compare_nodes
, 2 * nthreads
);
3352 pthread_mutex_init (&queue
->mutex
, NULL
);
3353 pthread_cond_init (&queue
->cond
, NULL
);
3356 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3357 does not need to lock NODE. */
3360 queue_insert (struct merge_node_queue
*queue
, struct merge_node
*node
)
3362 pthread_mutex_lock (&queue
->mutex
);
3363 heap_insert (queue
->priority_queue
, node
);
3364 node
->queued
= true;
3365 pthread_cond_signal (&queue
->cond
);
3366 pthread_mutex_unlock (&queue
->mutex
);
3369 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3371 static struct merge_node
*
3372 queue_pop (struct merge_node_queue
*queue
)
3374 struct merge_node
*node
;
3375 pthread_mutex_lock (&queue
->mutex
);
3376 while (! (node
= heap_remove_top (queue
->priority_queue
)))
3377 pthread_cond_wait (&queue
->cond
, &queue
->mutex
);
3378 pthread_mutex_unlock (&queue
->mutex
);
3380 node
->queued
= false;
3384 /* Output LINE to TFP, unless -u is specified and the line compares
3385 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3386 is null if TFP is standard output.
3388 This function does not save the line for comparison later, so it is
3389 appropriate only for internal sort. */
3392 write_unique (struct line
const *line
, FILE *tfp
, char const *temp_output
)
3396 if (saved_line
.text
&& ! compare (line
, &saved_line
))
3401 write_line (line
, tfp
, temp_output
);
3404 /* Merge the lines currently available to a NODE in the binary
3405 merge tree. Merge a number of lines appropriate for this merge
3406 level, assuming TOTAL_LINES is the total number of lines.
3408 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3409 the name of TFP, or is null if TFP is standard output. */
3412 mergelines_node (struct merge_node
*restrict node
, size_t total_lines
,
3413 FILE *tfp
, char const *temp_output
)
3415 struct line
*lo_orig
= node
->lo
;
3416 struct line
*hi_orig
= node
->hi
;
3417 size_t to_merge
= MAX_MERGE (total_lines
, node
->level
);
3421 if (node
->level
> MERGE_ROOT
)
3423 /* Merge to destination buffer. */
3424 struct line
*dest
= *node
->dest
;
3425 while (node
->lo
!= node
->end_lo
&& node
->hi
!= node
->end_hi
&& to_merge
--)
3426 if (compare (node
->lo
- 1, node
->hi
- 1) <= 0)
3427 *--dest
= *--node
->lo
;
3429 *--dest
= *--node
->hi
;
3431 merged_lo
= lo_orig
- node
->lo
;
3432 merged_hi
= hi_orig
- node
->hi
;
3434 if (node
->nhi
== merged_hi
)
3435 while (node
->lo
!= node
->end_lo
&& to_merge
--)
3436 *--dest
= *--node
->lo
;
3437 else if (node
->nlo
== merged_lo
)
3438 while (node
->hi
!= node
->end_hi
&& to_merge
--)
3439 *--dest
= *--node
->hi
;
3444 /* Merge directly to output. */
3445 while (node
->lo
!= node
->end_lo
&& node
->hi
!= node
->end_hi
&& to_merge
--)
3447 if (compare (node
->lo
- 1, node
->hi
- 1) <= 0)
3448 write_unique (--node
->lo
, tfp
, temp_output
);
3450 write_unique (--node
->hi
, tfp
, temp_output
);
3453 merged_lo
= lo_orig
- node
->lo
;
3454 merged_hi
= hi_orig
- node
->hi
;
3456 if (node
->nhi
== merged_hi
)
3458 while (node
->lo
!= node
->end_lo
&& to_merge
--)
3459 write_unique (--node
->lo
, tfp
, temp_output
);
3461 else if (node
->nlo
== merged_lo
)
3463 while (node
->hi
!= node
->end_hi
&& to_merge
--)
3464 write_unique (--node
->hi
, tfp
, temp_output
);
3469 merged_lo
= lo_orig
- node
->lo
;
3470 merged_hi
= hi_orig
- node
->hi
;
3471 node
->nlo
-= merged_lo
;
3472 node
->nhi
-= merged_hi
;
3475 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3476 NODE's children has available lines and the other either has
3477 available lines or has exhausted its lines. */
3480 queue_check_insert (struct merge_node_queue
*queue
, struct merge_node
*node
)
3484 bool lo_avail
= (node
->lo
- node
->end_lo
) != 0;
3485 bool hi_avail
= (node
->hi
- node
->end_hi
) != 0;
3486 if (lo_avail
? hi_avail
|| ! node
->nhi
: hi_avail
&& ! node
->nlo
)
3487 queue_insert (queue
, node
);
3491 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3494 queue_check_insert_parent (struct merge_node_queue
*queue
,
3495 struct merge_node
*node
)
3497 if (node
->level
> MERGE_ROOT
)
3499 lock_node (node
->parent
);
3500 queue_check_insert (queue
, node
->parent
);
3501 unlock_node (node
->parent
);
3503 else if (node
->nlo
+ node
->nhi
== 0)
3505 /* If the MERGE_ROOT NODE has finished merging, insert the
3507 queue_insert (queue
, node
->parent
);
3511 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3512 some of those lines, until the MERGE_END node is popped.
3513 TOTAL_LINES is the total number of lines. If merging at the top
3514 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3515 null if TFP is standard output. */
3518 merge_loop (struct merge_node_queue
*queue
,
3519 size_t total_lines
, FILE *tfp
, char const *temp_output
)
3523 struct merge_node
*node
= queue_pop (queue
);
3525 if (node
->level
== MERGE_END
)
3528 /* Reinsert so other threads can pop it. */
3529 queue_insert (queue
, node
);
3532 mergelines_node (node
, total_lines
, tfp
, temp_output
);
3533 queue_check_insert (queue
, node
);
3534 queue_check_insert_parent (queue
, node
);
3541 static void sortlines (struct line
*restrict
, size_t, size_t,
3542 struct merge_node
*, struct merge_node_queue
*,
3543 FILE *, char const *);
3545 /* Thread arguments for sortlines_thread. */
3549 /* Source, i.e., the array of lines to sort. This points just past
3550 the end of the array. */
3553 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3556 /* Number of lines in LINES and DEST. */
3557 size_t const total_lines
;
3559 /* Merge node. Lines from this node and this node's sibling will merged
3560 to this node's parent. */
3561 struct merge_node
*const node
;
3563 /* The priority queue controlling available work for the entire
3565 struct merge_node_queue
*const queue
;
3567 /* If at the top level, the file to output to, and the file's name.
3568 If the file is standard output, the file's name is null. */
3570 char const *output_temp
;
3573 /* Like sortlines, except with a signature acceptable to pthread_create. */
3576 sortlines_thread (void *data
)
3578 struct thread_args
const *args
= data
;
3579 sortlines (args
->lines
, args
->nthreads
, args
->total_lines
,
3580 args
->node
, args
->queue
, args
->tfp
,
3585 /* Sort lines, possibly in parallel. The arguments are as in struct
3588 The algorithm has three phases: node creation, sequential sort,
3591 During node creation, sortlines recursively visits each node in the
3592 binary merge tree and creates a NODE structure corresponding to all the
3593 future line merging NODE is responsible for. For each call to
3594 sortlines, half the available threads are assigned to each recursive
3595 call, until a leaf node having only 1 available thread is reached.
3597 Each leaf node then performs two sequential sorts, one on each half of
3598 the lines it is responsible for. It records in its NODE structure that
3599 there are two sorted sublists available to merge from, and inserts its
3600 NODE into the priority queue.
3602 The binary merge phase then begins. Each thread drops into a loop
3603 where the thread retrieves a NODE from the priority queue, merges lines
3604 available to that NODE, and potentially insert NODE or its parent back
3605 into the queue if there are sufficient available lines for them to
3606 merge. This continues until all lines at all nodes of the merge tree
3607 have been merged. */
3610 sortlines (struct line
*restrict lines
, size_t nthreads
,
3611 size_t total_lines
, struct merge_node
*node
,
3612 struct merge_node_queue
*queue
, FILE *tfp
, char const *temp_output
)
3614 size_t nlines
= node
->nlo
+ node
->nhi
;
3616 /* Calculate thread arguments. */
3617 size_t lo_threads
= nthreads
/ 2;
3618 size_t hi_threads
= nthreads
- lo_threads
;
3620 struct thread_args args
= {lines
, lo_threads
, total_lines
,
3621 node
->lo_child
, queue
, tfp
, temp_output
};
3623 if (nthreads
> 1 && SUBTHREAD_LINES_HEURISTIC
<= nlines
3624 && pthread_create (&thread
, NULL
, sortlines_thread
, &args
) == 0)
3626 sortlines (lines
- node
->nlo
, hi_threads
, total_lines
,
3627 node
->hi_child
, queue
, tfp
, temp_output
);
3628 pthread_join (thread
, NULL
);
3632 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3633 Sort with 1 thread. */
3634 size_t nlo
= node
->nlo
;
3635 size_t nhi
= node
->nhi
;
3636 struct line
*temp
= lines
- total_lines
;
3638 sequential_sort (lines
- nlo
, nhi
, temp
- nlo
/ 2, false);
3640 sequential_sort (lines
, nlo
, temp
, false);
3642 /* Update merge NODE. No need to lock yet. */
3644 node
->hi
= lines
- nlo
;
3645 node
->end_lo
= lines
- nlo
;
3646 node
->end_hi
= lines
- nlo
- nhi
;
3648 queue_insert (queue
, node
);
3649 merge_loop (queue
, total_lines
, tfp
, temp_output
);
3653 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3654 the same as OUTFILE. If found, replace each with the same
3655 temporary copy that can be merged into OUTFILE without destroying
3656 OUTFILE before it is completely read. This temporary copy does not
3657 count as a merge temp, so don't worry about incrementing NTEMPS in
3658 the caller; final cleanup will remove it, not zaptemp.
3660 This test ensures that an otherwise-erroneous use like
3661 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3662 It's not clear that POSIX requires this nicety.
3663 Detect common error cases, but don't try to catch obscure cases like
3664 "cat ... FILE ... | sort -m -o FILE"
3665 where traditional "sort" doesn't copy the input and where
3666 people should know that they're getting into trouble anyway.
3667 Catching these obscure cases would slow down performance in
3671 avoid_trashing_input (struct sortfile
*files
, size_t ntemps
,
3672 size_t nfiles
, char const *outfile
)
3675 bool got_outstat
= false;
3676 struct stat outstat
;
3677 struct tempnode
*tempcopy
= NULL
;
3679 for (i
= ntemps
; i
< nfiles
; i
++)
3681 bool is_stdin
= STREQ (files
[i
].name
, "-");
3685 if (outfile
&& STREQ (outfile
, files
[i
].name
) && !is_stdin
)
3691 if (fstat (STDOUT_FILENO
, &outstat
) != 0)
3697 ? fstat (STDIN_FILENO
, &instat
)
3698 : stat (files
[i
].name
, &instat
))
3700 && SAME_INODE (instat
, outstat
));
3708 tempcopy
= create_temp (&tftp
);
3709 mergefiles (&files
[i
], 0, 1, tftp
, tempcopy
->name
);
3712 files
[i
].name
= tempcopy
->name
;
3713 files
[i
].temp
= tempcopy
;
3718 /* Scan the input files to ensure all are accessible.
3719 Otherwise exit with a diagnostic.
3721 Note this will catch common issues with permissions etc.
3722 but will fail to notice issues where you can open() but not read(),
3723 like when a directory is specified on some systems.
3724 Catching these obscure cases could slow down performance in
3728 check_inputs (char *const *files
, size_t nfiles
)
3731 for (i
= 0; i
< nfiles
; i
++)
3733 if (STREQ (files
[i
], "-"))
3736 if (euidaccess (files
[i
], R_OK
) != 0)
3737 die (_("cannot read"), files
[i
]);
3741 /* Ensure a specified output file can be created or written to,
3742 and point stdout to it. Do not truncate the file.
3743 Exit with a diagnostic on failure. */
3746 check_output (char const *outfile
)
3750 int outfd
= open (outfile
, O_WRONLY
| O_CREAT
| O_BINARY
, MODE_RW_UGO
);
3752 die (_("open failed"), outfile
);
3753 move_fd_or_die (outfd
, STDOUT_FILENO
);
3757 /* Merge the input FILES. NTEMPS is the number of files at the
3758 start of FILES that are temporary; it is zero at the top level.
3759 NFILES is the total number of files. Put the output in
3760 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3763 merge (struct sortfile
*files
, size_t ntemps
, size_t nfiles
,
3764 char const *output_file
)
3766 while (nmerge
< nfiles
)
3768 /* Number of input files processed so far. */
3771 /* Number of output files generated so far. */
3774 /* nfiles % NMERGE; this counts input files that are left over
3775 after all full-sized merges have been done. */
3778 /* Number of easily-available slots at the next loop iteration. */
3781 /* Do as many NMERGE-size merges as possible. In the case that
3782 nmerge is bogus, increment by the maximum number of file
3783 descriptors allowed. */
3784 for (out
= in
= 0; nmerge
<= nfiles
- in
; out
++)
3787 struct tempnode
*temp
= create_temp (&tfp
);
3788 size_t num_merged
= mergefiles (&files
[in
], MIN (ntemps
, nmerge
),
3789 nmerge
, tfp
, temp
->name
);
3790 ntemps
-= MIN (ntemps
, num_merged
);
3791 files
[out
].name
= temp
->name
;
3792 files
[out
].temp
= temp
;
3796 remainder
= nfiles
- in
;
3797 cheap_slots
= nmerge
- out
% nmerge
;
3799 if (cheap_slots
< remainder
)
3801 /* So many files remain that they can't all be put into the last
3802 NMERGE-sized output window. Do one more merge. Merge as few
3803 files as possible, to avoid needless I/O. */
3804 size_t nshortmerge
= remainder
- cheap_slots
+ 1;
3806 struct tempnode
*temp
= create_temp (&tfp
);
3807 size_t num_merged
= mergefiles (&files
[in
], MIN (ntemps
, nshortmerge
),
3808 nshortmerge
, tfp
, temp
->name
);
3809 ntemps
-= MIN (ntemps
, num_merged
);
3810 files
[out
].name
= temp
->name
;
3811 files
[out
++].temp
= temp
;
3815 /* Put the remaining input files into the last NMERGE-sized output
3816 window, so they will be merged in the next pass. */
3817 memmove (&files
[out
], &files
[in
], (nfiles
- in
) * sizeof *files
);
3822 avoid_trashing_input (files
, ntemps
, nfiles
, output_file
);
3824 /* We aren't guaranteed that this final mergefiles will work, therefore we
3825 try to merge into the output, and then merge as much as we can into a
3826 temp file if we can't. Repeat. */
3830 /* Merge directly into the output file if possible. */
3832 size_t nopened
= open_input_files (files
, nfiles
, &fps
);
3834 if (nopened
== nfiles
)
3836 FILE *ofp
= stream_open (output_file
, "w");
3839 mergefps (files
, ntemps
, nfiles
, ofp
, output_file
, fps
);
3842 if (errno
!= EMFILE
|| nopened
<= 2)
3843 die (_("open failed"), output_file
);
3845 else if (nopened
<= 2)
3846 die (_("open failed"), files
[nopened
].name
);
3848 /* We ran out of file descriptors. Close one of the input
3849 files, to gain a file descriptor. Then create a temporary
3850 file with our spare file descriptor. Retry if that failed
3851 (e.g., some other process could open a file between the time
3852 we closed and tried to create). */
3854 struct tempnode
*temp
;
3858 xfclose (fps
[nopened
], files
[nopened
].name
);
3859 temp
= maybe_create_temp (&tfp
, ! (nopened
<= 2));
3863 /* Merge into the newly allocated temporary. */
3864 mergefps (&files
[0], MIN (ntemps
, nopened
), nopened
, tfp
, temp
->name
,
3866 ntemps
-= MIN (ntemps
, nopened
);
3867 files
[0].name
= temp
->name
;
3868 files
[0].temp
= temp
;
3870 memmove (&files
[1], &files
[nopened
], (nfiles
- nopened
) * sizeof *files
);
3872 nfiles
-= nopened
- 1;
3876 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3879 sort (char *const *files
, size_t nfiles
, char const *output_file
,
3883 IF_LINT (buf
.buf
= NULL
);
3885 bool output_file_created
= false;
3891 char const *temp_output
;
3892 char const *file
= *files
;
3893 FILE *fp
= xfopen (file
, "r");
3896 size_t bytes_per_line
;
3902 while (tmp
< nthreads
)
3907 bytes_per_line
= (mult
* sizeof (struct line
));
3910 bytes_per_line
= sizeof (struct line
) * 3 / 2;
3913 initbuf (&buf
, bytes_per_line
,
3914 sort_buffer_size (&fp
, 1, files
, nfiles
, bytes_per_line
));
3919 while (fillbuf (&buf
, fp
, file
))
3923 if (buf
.eof
&& nfiles
3924 && (bytes_per_line
+ 1
3925 < (buf
.alloc
- buf
.used
- bytes_per_line
* buf
.nlines
)))
3927 /* End of file, but there is more input and buffer room.
3928 Concatenate the next input file; this is faster in
3930 buf
.left
= buf
.used
;
3934 saved_line
.text
= NULL
;
3935 line
= buffer_linelim (&buf
);
3936 if (buf
.eof
&& !nfiles
&& !ntemps
&& !buf
.left
)
3939 tfp
= xfopen (output_file
, "w");
3940 temp_output
= output_file
;
3941 output_file_created
= true;
3946 temp_output
= create_temp (&tfp
)->name
;
3950 struct merge_node_queue queue
;
3951 queue_init (&queue
, nthreads
);
3952 struct merge_node
*merge_tree
=
3953 merge_tree_init (nthreads
, buf
.nlines
, line
);
3955 sortlines (line
, nthreads
, buf
.nlines
, merge_tree
+ 1,
3956 &queue
, tfp
, temp_output
);
3959 merge_tree_destroy (nthreads
, merge_tree
);
3960 queue_destroy (&queue
);
3964 write_unique (line
- 1, tfp
, temp_output
);
3966 xfclose (tfp
, temp_output
);
3968 if (output_file_created
)
3977 if (! output_file_created
)
3980 struct tempnode
*node
= temphead
;
3981 struct sortfile
*tempfiles
= xnmalloc (ntemps
, sizeof *tempfiles
);
3982 for (i
= 0; node
; i
++)
3984 tempfiles
[i
].name
= node
->name
;
3985 tempfiles
[i
].temp
= node
;
3988 merge (tempfiles
, ntemps
, ntemps
, output_file
);
3995 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3998 insertkey (struct keyfield
*key_arg
)
4000 struct keyfield
**p
;
4001 struct keyfield
*key
= xmemdup (key_arg
, sizeof *key
);
4003 for (p
= &keylist
; *p
; p
= &(*p
)->next
)
4009 /* Report a bad field specification SPEC, with extra info MSGID. */
4011 static void badfieldspec (char const *, char const *)
4014 badfieldspec (char const *spec
, char const *msgid
)
4016 error (SORT_FAILURE
, 0, _("%s: invalid field specification %s"),
4017 _(msgid
), quote (spec
));
4021 /* Report incompatible options. */
4023 static void incompatible_options (char const *) ATTRIBUTE_NORETURN
;
4025 incompatible_options (char const *opts
)
4027 error (SORT_FAILURE
, 0, _("options '-%s' are incompatible"), (opts
));
4031 /* Check compatibility of ordering options. */
4034 check_ordering_compatibility (void)
4036 struct keyfield
*key
;
4038 for (key
= keylist
; key
; key
= key
->next
)
4039 if (1 < (key
->numeric
+ key
->general_numeric
+ key
->human_numeric
4040 + key
->month
+ (key
->version
| key
->random
| !!key
->ignore
)))
4042 /* The following is too big, but guaranteed to be "big enough". */
4043 char opts
[sizeof short_options
];
4044 /* Clear flags we're not interested in. */
4045 key
->skipsblanks
= key
->skipeblanks
= key
->reverse
= false;
4046 key_to_opts (key
, opts
);
4047 incompatible_options (opts
);
4051 /* Parse the leading integer in STRING and store the resulting value
4052 (which must fit into size_t) into *VAL. Return the address of the
4053 suffix after the integer. If the value is too large, silently
4054 substitute SIZE_MAX. If MSGID is NULL, return NULL after
4055 failure; otherwise, report MSGID and exit on failure. */
4058 parse_field_count (char const *string
, size_t *val
, char const *msgid
)
4063 switch (xstrtoumax (string
, &suffix
, 10, &n
, ""))
4066 case LONGINT_INVALID_SUFFIX_CHAR
:
4071 case LONGINT_OVERFLOW
:
4072 case LONGINT_OVERFLOW
| LONGINT_INVALID_SUFFIX_CHAR
:
4076 case LONGINT_INVALID
:
4078 error (SORT_FAILURE
, 0, _("%s: invalid count at start of %s"),
4079 _(msgid
), quote (string
));
4086 /* Handle interrupts and hangups. */
4089 sighandler (int sig
)
4092 signal (sig
, SIG_IGN
);
4096 signal (sig
, SIG_DFL
);
4100 /* Set the ordering options for KEY specified in S.
4101 Return the address of the first character in S that
4102 is not a valid ordering option.
4103 BLANKTYPE is the kind of blanks that 'b' should skip. */
4106 set_ordering (char const *s
, struct keyfield
*key
, enum blanktype blanktype
)
4113 if (blanktype
== bl_start
|| blanktype
== bl_both
)
4114 key
->skipsblanks
= true;
4115 if (blanktype
== bl_end
|| blanktype
== bl_both
)
4116 key
->skipeblanks
= true;
4119 key
->ignore
= nondictionary
;
4122 key
->translate
= fold_toupper
;
4125 key
->general_numeric
= true;
4128 key
->human_numeric
= true;
4131 /* Option order should not matter, so don't let -i override
4132 -d. -d implies -i, but -i does not imply -d. */
4134 key
->ignore
= nonprinting
;
4140 key
->numeric
= true;
4146 key
->reverse
= true;
4149 key
->version
= true;
4159 /* Initialize KEY. */
4161 static struct keyfield
*
4162 key_init (struct keyfield
*key
)
4164 memset (key
, 0, sizeof *key
);
4165 key
->eword
= SIZE_MAX
;
4170 main (int argc
, char **argv
)
4172 struct keyfield
*key
;
4173 struct keyfield key_buf
;
4174 struct keyfield gkey
;
4175 bool gkey_only
= false;
4179 bool mergeonly
= false;
4180 char *random_source
= NULL
;
4181 bool need_random
= false;
4182 size_t nthreads
= 0;
4184 bool posixly_correct
= (getenv ("POSIXLY_CORRECT") != NULL
);
4185 bool obsolete_usage
= (posix2_version () < 200112);
4187 char *files_from
= NULL
;
4189 char const *outfile
= NULL
;
4192 initialize_main (&argc
, &argv
);
4193 set_program_name (argv
[0]);
4194 locale_ok
= setlocale (LC_ALL
, "");
4195 bindtextdomain (PACKAGE
, LOCALEDIR
);
4196 textdomain (PACKAGE
);
4198 initialize_exit_failure (SORT_FAILURE
);
4200 hard_LC_COLLATE
= hard_locale (LC_COLLATE
);
4201 #if HAVE_NL_LANGINFO
4202 hard_LC_TIME
= hard_locale (LC_TIME
);
4205 /* Get locale's representation of the decimal point. */
4207 struct lconv
const *locale
= localeconv ();
4209 /* If the locale doesn't define a decimal point, or if the decimal
4210 point is multibyte, use the C locale's decimal point. FIXME:
4211 add support for multibyte decimal points. */
4212 decimal_point
= to_uchar (locale
->decimal_point
[0]);
4213 if (! decimal_point
|| locale
->decimal_point
[1])
4214 decimal_point
= '.';
4216 /* FIXME: add support for multibyte thousands separators. */
4217 thousands_sep
= to_uchar (*locale
->thousands_sep
);
4218 if (! thousands_sep
|| locale
->thousands_sep
[1])
4222 have_read_stdin
= false;
4227 static int const sig
[] =
4229 /* The usual suspects. */
4230 SIGALRM
, SIGHUP
, SIGINT
, SIGPIPE
, SIGQUIT
, SIGTERM
,
4247 enum { nsigs
= ARRAY_CARDINALITY (sig
) };
4250 struct sigaction act
;
4252 sigemptyset (&caught_signals
);
4253 for (i
= 0; i
< nsigs
; i
++)
4255 sigaction (sig
[i
], NULL
, &act
);
4256 if (act
.sa_handler
!= SIG_IGN
)
4257 sigaddset (&caught_signals
, sig
[i
]);
4260 act
.sa_handler
= sighandler
;
4261 act
.sa_mask
= caught_signals
;
4264 for (i
= 0; i
< nsigs
; i
++)
4265 if (sigismember (&caught_signals
, sig
[i
]))
4266 sigaction (sig
[i
], &act
, NULL
);
4268 for (i
= 0; i
< nsigs
; i
++)
4269 if (signal (sig
[i
], SIG_IGN
) != SIG_IGN
)
4271 signal (sig
[i
], sighandler
);
4272 siginterrupt (sig
[i
], 1);
4276 signal (SIGCHLD
, SIG_DFL
); /* Don't inherit CHLD handling from parent. */
4278 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4279 atexit (exit_cleanup
);
4282 gkey
.sword
= SIZE_MAX
;
4284 files
= xnmalloc (argc
, sizeof *files
);
4288 /* Parse an operand as a file after "--" was seen; or if
4289 pedantic and a file was seen, unless the POSIX version
4290 predates 1003.1-2001 and -c was not seen and the operand is
4291 "-o FILE" or "-oFILE". */
4295 || (posixly_correct
&& nfiles
!= 0
4296 && ! (obsolete_usage
4299 && argv
[optind
][0] == '-' && argv
[optind
][1] == 'o'
4300 && (argv
[optind
][2] || optind
+ 1 != argc
)))
4301 || ((c
= getopt_long (argc
, argv
, short_options
,
4307 files
[nfiles
++] = argv
[optind
++];
4313 if (optarg
[0] == '+')
4315 bool minus_pos_usage
= (optind
!= argc
&& argv
[optind
][0] == '-'
4316 && ISDIGIT (argv
[optind
][1]));
4317 obsolete_usage
|= minus_pos_usage
&& !posixly_correct
;
4320 /* Treat +POS1 [-POS2] as a key if possible; but silently
4321 treat an operand as a file if it is not a valid +POS1. */
4322 key
= key_init (&key_buf
);
4323 s
= parse_field_count (optarg
+ 1, &key
->sword
, NULL
);
4325 s
= parse_field_count (s
+ 1, &key
->schar
, NULL
);
4326 if (! (key
->sword
|| key
->schar
))
4327 key
->sword
= SIZE_MAX
;
4328 if (! s
|| *set_ordering (s
, key
, bl_start
))
4332 if (minus_pos_usage
)
4334 char const *optarg1
= argv
[optind
++];
4335 s
= parse_field_count (optarg1
+ 1, &key
->eword
,
4336 N_("invalid number after '-'"));
4337 /* When called with a non-NULL message ID,
4338 parse_field_count cannot return NULL. Tell static
4339 analysis tools that dereferencing S is safe. */
4342 s
= parse_field_count (s
+ 1, &key
->echar
,
4343 N_("invalid number after '.'"));
4344 if (!key
->echar
&& key
->eword
)
4346 /* obsolescent syntax +A.x -B.y is equivalent to:
4347 -k A+1.x+1,B.y (when y = 0)
4348 -k A+1.x+1,B+1.y (when y > 0)
4349 So eword is decremented as in the -k case
4350 only when the end field (B) is specified and
4354 if (*set_ordering (s
, key
, bl_end
))
4355 badfieldspec (optarg1
,
4356 N_("stray character in field spec"));
4358 key
->obsolete_used
= true;
4364 files
[nfiles
++] = optarg
;
4368 c
= XARGMATCH ("--sort", optarg
, sort_args
, sort_types
);
4385 set_ordering (str
, &gkey
, bl_both
);
4391 ? XARGMATCH ("--check", optarg
, check_args
, check_types
)
4396 if (checkonly
&& checkonly
!= c
)
4397 incompatible_options ("cC");
4401 case COMPRESS_PROGRAM_OPTION
:
4402 if (compress_program
&& !STREQ (compress_program
, optarg
))
4403 error (SORT_FAILURE
, 0, _("multiple compress programs specified"));
4404 compress_program
= optarg
;
4407 case DEBUG_PROGRAM_OPTION
:
4411 case FILES0_FROM_OPTION
:
4412 files_from
= optarg
;
4416 key
= key_init (&key_buf
);
4419 s
= parse_field_count (optarg
, &key
->sword
,
4420 N_("invalid number at field start"));
4423 /* Provoke with 'sort -k0' */
4424 badfieldspec (optarg
, N_("field number is zero"));
4428 s
= parse_field_count (s
+ 1, &key
->schar
,
4429 N_("invalid number after '.'"));
4432 /* Provoke with 'sort -k1.0' */
4433 badfieldspec (optarg
, N_("character offset is zero"));
4436 if (! (key
->sword
|| key
->schar
))
4437 key
->sword
= SIZE_MAX
;
4438 s
= set_ordering (s
, key
, bl_start
);
4441 key
->eword
= SIZE_MAX
;
4447 s
= parse_field_count (s
+ 1, &key
->eword
,
4448 N_("invalid number after ','"));
4451 /* Provoke with 'sort -k1,0' */
4452 badfieldspec (optarg
, N_("field number is zero"));
4456 s
= parse_field_count (s
+ 1, &key
->echar
,
4457 N_("invalid number after '.'"));
4459 s
= set_ordering (s
, key
, bl_end
);
4462 badfieldspec (optarg
, N_("stray character in field spec"));
4471 specify_nmerge (oi
, c
, optarg
);
4475 if (outfile
&& !STREQ (outfile
, optarg
))
4476 error (SORT_FAILURE
, 0, _("multiple output files specified"));
4480 case RANDOM_SOURCE_OPTION
:
4481 if (random_source
&& !STREQ (random_source
, optarg
))
4482 error (SORT_FAILURE
, 0, _("multiple random sources specified"));
4483 random_source
= optarg
;
4491 specify_sort_size (oi
, c
, optarg
);
4496 char newtab
= optarg
[0];
4498 error (SORT_FAILURE
, 0, _("empty tab"));
4501 if (STREQ (optarg
, "\\0"))
4505 /* Provoke with 'sort -txx'. Complain about
4506 "multi-character tab" instead of "multibyte tab", so
4507 that the diagnostic's wording does not need to be
4508 changed once multibyte characters are supported. */
4509 error (SORT_FAILURE
, 0, _("multi-character tab %s"),
4513 if (tab
!= TAB_DEFAULT
&& tab
!= newtab
)
4514 error (SORT_FAILURE
, 0, _("incompatible tabs"));
4520 add_temp_dir (optarg
);
4523 case PARALLEL_OPTION
:
4524 nthreads
= specify_nthreads (oi
, c
, optarg
);
4532 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4533 through Solaris 7. It is also accepted by many non-Solaris
4534 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4535 -y is marked as obsolete starting with Solaris 8 (1999), but is
4536 still accepted as of Solaris 10 prerelease (2004).
4538 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4539 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4540 and which in general ignores the argument after "-y" if it
4541 consists entirely of digits (it can even be empty). */
4542 if (optarg
== argv
[optind
- 1])
4545 for (p
= optarg
; ISDIGIT (*p
); p
++)
4547 optind
-= (*p
!= '\0');
4555 case_GETOPT_HELP_CHAR
;
4557 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
4560 usage (SORT_FAILURE
);
4568 /* When using --files0-from=F, you may not specify any files
4569 on the command-line. */
4572 error (0, 0, _("extra operand %s"), quoteaf (files
[0]));
4573 fprintf (stderr
, "%s\n",
4574 _("file operands cannot be combined with --files0-from"));
4575 usage (SORT_FAILURE
);
4578 if (STREQ (files_from
, "-"))
4582 stream
= fopen (files_from
, "r");
4584 error (SORT_FAILURE
, errno
, _("cannot open %s for reading"),
4585 quoteaf (files_from
));
4588 readtokens0_init (&tok
);
4590 if (! readtokens0 (stream
, &tok
) || fclose (stream
) != 0)
4591 error (SORT_FAILURE
, 0, _("cannot read file names from %s"),
4592 quoteaf (files_from
));
4600 for (i
= 0; i
< nfiles
; i
++)
4602 if (STREQ (files
[i
], "-"))
4603 error (SORT_FAILURE
, 0, _("when reading file names from stdin, "
4604 "no file name of %s allowed"),
4605 quoteaf (files
[i
]));
4606 else if (files
[i
][0] == '\0')
4608 /* Using the standard 'filename:line-number:' prefix here is
4609 not totally appropriate, since NUL is the separator,
4610 not NL, but it might be better than nothing. */
4611 unsigned long int file_number
= i
+ 1;
4612 error (SORT_FAILURE
, 0,
4613 _("%s:%lu: invalid zero-length file name"),
4614 quotef (files_from
), file_number
);
4619 error (SORT_FAILURE
, 0, _("no input from %s"),
4620 quoteaf (files_from
));
4623 /* Inheritance of global options to individual keys. */
4624 for (key
= keylist
; key
; key
= key
->next
)
4626 if (default_key_compare (key
) && !key
->reverse
)
4628 key
->ignore
= gkey
.ignore
;
4629 key
->translate
= gkey
.translate
;
4630 key
->skipsblanks
= gkey
.skipsblanks
;
4631 key
->skipeblanks
= gkey
.skipeblanks
;
4632 key
->month
= gkey
.month
;
4633 key
->numeric
= gkey
.numeric
;
4634 key
->general_numeric
= gkey
.general_numeric
;
4635 key
->human_numeric
= gkey
.human_numeric
;
4636 key
->version
= gkey
.version
;
4637 key
->random
= gkey
.random
;
4638 key
->reverse
= gkey
.reverse
;
4641 need_random
|= key
->random
;
4644 if (!keylist
&& !default_key_compare (&gkey
))
4648 need_random
|= gkey
.random
;
4651 check_ordering_compatibility ();
4655 if (checkonly
|| outfile
)
4657 static char opts
[] = "X --debug";
4658 opts
[0] = (checkonly
? checkonly
: 'o');
4659 incompatible_options (opts
);
4662 /* Always output the locale in debug mode, since this
4663 is such a common source of confusion. */
4664 if (hard_LC_COLLATE
)
4665 error (0, 0, _("using %s sorting rules"),
4666 quote (setlocale (LC_COLLATE
, NULL
)));
4669 error (0, 0, "%s%s", locale_ok
? "" : _("failed to set locale; "),
4670 _("using simple byte comparison"));
4672 key_warnings (&gkey
, gkey_only
);
4675 reverse
= gkey
.reverse
;
4678 random_md5_state_init (random_source
);
4680 if (temp_dir_count
== 0)
4682 char const *tmp_dir
= getenv ("TMPDIR");
4683 add_temp_dir (tmp_dir
? tmp_dir
: DEFAULT_TMPDIR
);
4690 files
= xmalloc (sizeof *files
);
4691 *files
= (char *) "-";
4694 /* Need to re-check that we meet the minimum requirement for memory
4695 usage with the final value for NMERGE. */
4697 sort_size
= MAX (sort_size
, MIN_SORT_SIZE
);
4702 error (SORT_FAILURE
, 0, _("extra operand %s not allowed with -%c"),
4703 quoteaf (files
[1]), checkonly
);
4707 static char opts
[] = {0, 'o', 0};
4708 opts
[0] = checkonly
;
4709 incompatible_options (opts
);
4712 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4713 input is not properly sorted. */
4714 return check (files
[0], checkonly
) ? EXIT_SUCCESS
: SORT_OUT_OF_ORDER
;
4717 /* Check all inputs are accessible, or exit immediately. */
4718 check_inputs (files
, nfiles
);
4720 /* Check output is writable, or exit immediately. */
4721 check_output (outfile
);
4725 struct sortfile
*sortfiles
= xcalloc (nfiles
, sizeof *sortfiles
);
4728 for (i
= 0; i
< nfiles
; ++i
)
4729 sortfiles
[i
].name
= files
[i
];
4731 merge (sortfiles
, 0, nfiles
, outfile
);
4732 IF_LINT (free (sortfiles
));
4738 unsigned long int np
= num_processors (NPROC_CURRENT_OVERRIDABLE
);
4739 nthreads
= MIN (np
, DEFAULT_MAX_THREADS
);
4742 /* Avoid integer overflow later. */
4743 size_t nthreads_max
= SIZE_MAX
/ (2 * sizeof (struct merge_node
));
4744 nthreads
= MIN (nthreads
, nthreads_max
);
4746 sort (files
, nfiles
, outfile
, nthreads
);
4751 readtokens0_free (&tok
);
4756 if (have_read_stdin
&& fclose (stdin
) == EOF
)
4757 die (_("close failed"), "-");
4759 return EXIT_SUCCESS
;