1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 88, 1991-2004 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written December 1988 by Mike Haertel.
19 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
20 or (US mail) as Mike Haertel c/o Free Software Foundation.
22 Ørn E. Hansen added NLS support in 1997. */
27 #include <sys/types.h>
32 #include "hard-locale.h"
34 #include "long-options.h"
37 #include "stdio-safer.h"
41 #if HAVE_SYS_RESOURCE_H
42 # include <sys/resource.h>
45 struct rlimit
{ size_t rlim_cur
; };
46 # define getrlimit(Resource, Rlp) (-1)
49 /* The official name of this program (e.g., no `g' prefix). */
50 #define PROGRAM_NAME "sort"
52 #define AUTHORS "Mike Haertel", "Paul Eggert"
54 #if HAVE_LANGINFO_CODESET
55 # include <langinfo.h>
59 # define sigprocmask(How, Set, Oset) /* empty */
67 #define UCHAR_LIM (UCHAR_MAX + 1)
68 #define UCHAR(c) ((unsigned char) (c))
70 #ifndef DEFAULT_TMPDIR
71 # define DEFAULT_TMPDIR "/tmp"
77 /* POSIX says to exit with status 1 if invoked with -c and the
78 input is not properly sorted. */
79 SORT_OUT_OF_ORDER
= 1,
81 /* POSIX says any other irregular exit must exit with a status
82 code greater than 1. */
86 #define C_DECIMAL_POINT '.'
87 #define NEGATION_SIGN '-'
88 #define NUMERIC_ZERO '0'
92 static char decimal_point
;
93 static int th_sep
; /* if CHAR_MAX + 1, then there is no thousands separator */
95 /* Nonzero if the corresponding locales are hard. */
96 static bool hard_LC_COLLATE
;
98 static bool hard_LC_TIME
;
101 # define IS_THOUSANDS_SEP(x) ((x) == th_sep)
105 # define decimal_point C_DECIMAL_POINT
106 # define IS_THOUSANDS_SEP(x) 0
110 #define NONZERO(x) (x != 0)
112 /* The kind of blanks for '-b' to skip in various options. */
113 enum blanktype
{ bl_start
, bl_end
, bl_both
};
115 /* The character marking end of line. Default to \n. */
116 static char eolchar
= '\n';
118 /* Lines are held in core as counted strings. */
121 char *text
; /* Text of the line. */
122 size_t length
; /* Length including final newline. */
123 char *keybeg
; /* Start of first key. */
124 char *keylim
; /* Limit of first key. */
130 char *buf
; /* Dynamically allocated buffer,
131 partitioned into 3 regions:
134 - an array of lines, in reverse order. */
135 size_t used
; /* Number of bytes used for input data. */
136 size_t nlines
; /* Number of lines in the line array. */
137 size_t alloc
; /* Number of bytes allocated. */
138 size_t left
; /* Number of bytes left from previous reads. */
139 size_t line_bytes
; /* Number of bytes to reserve for each line. */
140 bool eof
; /* An EOF has been read. */
145 size_t sword
; /* Zero-origin 'word' to start at. */
146 size_t schar
; /* Additional characters to skip. */
147 size_t eword
; /* Zero-origin first word after field. */
148 size_t echar
; /* Additional characters in field. */
149 bool const *ignore
; /* Boolean array of characters to ignore. */
150 char const *translate
; /* Translation applied to characters. */
151 bool skipsblanks
; /* Skip leading blanks at start. */
152 bool skipeblanks
; /* Skip trailing blanks at finish. */
153 bool numeric
; /* Flag for numeric comparison. Handle
154 strings of digits with optional decimal
155 point, but no exponential notation. */
156 bool general_numeric
; /* Flag for general, numeric comparison.
157 Handle numbers in exponential notation. */
158 bool month
; /* Flag for comparison by month name. */
159 bool reverse
; /* Reverse the sense of comparison. */
160 struct keyfield
*next
; /* Next keyfield to try. */
169 /* The name this program was run with. */
172 /* FIXME: None of these tables work with multibyte character sets.
173 Also, there are many other bugs when handling multibyte characters.
174 One way to fix this is to rewrite `sort' to use wide characters
175 internally, but doing this with good performance is a bit
178 /* Table of blanks. */
179 static bool blanks
[UCHAR_LIM
];
181 /* Table of non-printing characters. */
182 static bool nonprinting
[UCHAR_LIM
];
184 /* Table of non-dictionary characters (not letters, digits, or blanks). */
185 static bool nondictionary
[UCHAR_LIM
];
187 /* Translation table folding lower case to upper. */
188 static char fold_toupper
[UCHAR_LIM
];
190 #define MONTHS_PER_YEAR 12
192 /* Table mapping month names to integers.
193 Alphabetic order allows binary search. */
194 static struct month monthtab
[] =
210 /* During the merge phase, the number of files to merge at once. */
213 /* Minimum size for a merge or check buffer. */
214 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
216 /* Minimum sort size; the code might not work with smaller sizes. */
217 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
219 /* The number of bytes needed for a merge or check buffer, which can
220 function relatively efficiently even if it holds only one line. If
221 a longer line is seen, this value is increased. */
222 static size_t merge_buffer_size
= MAX (MIN_MERGE_BUFFER_SIZE
, 256 * 1024);
224 /* The approximate maximum number of bytes of main memory to use, as
225 specified by the user. Zero if the user has not specified a size. */
226 static size_t sort_size
;
228 /* The guessed size for non-regular files. */
229 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
231 /* Array of directory names in which any temporary files are to be created. */
232 static char const **temp_dirs
;
234 /* Number of temporary directory names used. */
235 static size_t temp_dir_count
;
237 /* Number of allocated slots in temp_dirs. */
238 static size_t temp_dir_alloc
;
240 /* Flag to reverse the order of all comparisons. */
243 /* Flag for stable sort. This turns off the last ditch bytewise
244 comparison of lines, and instead leaves lines in the same order
245 they were read if all keys compare equal. */
248 /* If TAB has this value, blanks separate fields. */
249 enum { TAB_DEFAULT
= CHAR_MAX
+ 1 };
251 /* Tab character separating fields. If TAB_DEFAULT, then fields are
252 separated by the empty string between a non-blank character and a blank
254 static int tab
= TAB_DEFAULT
;
256 /* Flag to remove consecutive duplicate lines from the output.
257 Only the last of a sequence of equal lines will be output. */
260 /* Nonzero if any of the input files are the standard input. */
261 static bool have_read_stdin
;
263 /* List of key field comparisons to be tried. */
264 static struct keyfield
*keylist
;
266 static void sortlines_temp (struct line
*, size_t, struct line
*);
271 if (status
!= EXIT_SUCCESS
)
272 fprintf (stderr
, _("Try `%s --help' for more information.\n"),
277 Usage: %s [OPTION]... [FILE]...\n\
281 Write sorted concatenation of all FILE(s) to standard output.\n\
287 Mandatory arguments to long options are mandatory for short options too.\n\
290 -b, --ignore-leading-blanks ignore leading blanks\n\
291 -d, --dictionary-order consider only blanks and alphanumeric characters\n\
292 -f, --ignore-case fold lower case to upper case characters\n\
295 -g, --general-numeric-sort compare according to general numerical value\n\
296 -i, --ignore-nonprinting consider only printable characters\n\
297 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
298 -n, --numeric-sort compare according to string numerical value\n\
299 -r, --reverse reverse the result of comparisons\n\
305 -c, --check check whether input is sorted; do not sort\n\
306 -k, --key=POS1[,POS2] start a key at POS1, end it at POS 2 (origin 1)\n\
307 -m, --merge merge already sorted files; do not sort\n\
308 -o, --output=FILE write result to FILE instead of standard output\n\
309 -s, --stable stabilize sort by disabling last-resort comparison\n\
310 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
313 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
314 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
315 multiple options specify multiple directories\n\
316 -u, --unique with -c, check for strict ordering;\n\
317 without -c, output only the first of an equal run\n\
320 -z, --zero-terminated end lines with 0 byte, not newline\n\
322 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
323 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
326 POS is F[.C][OPTS], where F is the field number and C the character position\n\
327 in the field. OPTS is one or more single-letter ordering options, which\n\
328 override global ordering options for that key. If no key is given, use the\n\
329 entire line as the key.\n\
331 SIZE may be followed by the following multiplicative suffixes:\n\
334 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
336 With no FILE, or when FILE is -, read standard input.\n\
339 The locale specified by the environment affects sort order.\n\
340 Set LC_ALL=C to get the traditional sort order that uses\n\
341 native byte values.\n\
343 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT
);
349 #define COMMON_SHORT_OPTIONS "-bcdfgik:mMno:rsS:t:T:uz"
351 static struct option
const long_options
[] =
353 {"ignore-leading-blanks", no_argument
, NULL
, 'b'},
354 {"check", no_argument
, NULL
, 'c'},
355 {"dictionary-order", no_argument
, NULL
, 'd'},
356 {"ignore-case", no_argument
, NULL
, 'f'},
357 {"general-numeric-sort", no_argument
, NULL
, 'g'},
358 {"ignore-nonprinting", no_argument
, NULL
, 'i'},
359 {"key", required_argument
, NULL
, 'k'},
360 {"merge", no_argument
, NULL
, 'm'},
361 {"month-sort", no_argument
, NULL
, 'M'},
362 {"numeric-sort", no_argument
, NULL
, 'n'},
363 {"output", required_argument
, NULL
, 'o'},
364 {"reverse", no_argument
, NULL
, 'r'},
365 {"stable", no_argument
, NULL
, 's'},
366 {"buffer-size", required_argument
, NULL
, 'S'},
367 {"field-separator", required_argument
, NULL
, 't'},
368 {"temporary-directory", required_argument
, NULL
, 'T'},
369 {"unique", no_argument
, NULL
, 'u'},
370 {"zero-terminated", no_argument
, NULL
, 'z'},
371 {GETOPT_HELP_OPTION_DECL
},
372 {GETOPT_VERSION_OPTION_DECL
},
376 /* The set of signals that are caught. */
377 static sigset_t caught_signals
;
379 /* The list of temporary files. */
382 struct tempnode
*volatile next
;
383 char name
[1]; /* Actual size is 1 + file name length. */
385 static struct tempnode
*volatile temphead
;
387 /* Clean up any remaining temporary files. */
392 struct tempnode
const *node
;
394 for (node
= temphead
; node
; node
= node
->next
)
398 /* Report MESSAGE for FILE, then clean up and exit. */
400 static void die (char const *, char const *) ATTRIBUTE_NORETURN
;
402 die (char const *message
, char const *file
)
404 error (0, errno
, "%s: %s", message
, file
);
408 /* Create a new temporary file, returning its newly allocated name.
409 Store into *PFP a stream open for writing. */
412 create_temp_file (FILE **pfp
)
414 static char const slashbase
[] = "/sortXXXXXX";
415 static size_t temp_dir_index
;
419 char const *temp_dir
= temp_dirs
[temp_dir_index
];
420 size_t len
= strlen (temp_dir
);
421 struct tempnode
*node
=
422 xmalloc (sizeof node
->next
+ len
+ sizeof slashbase
);
423 char *file
= node
->name
;
425 memcpy (file
, temp_dir
, len
);
426 memcpy (file
+ len
, slashbase
, sizeof slashbase
);
427 node
->next
= temphead
;
428 if (++temp_dir_index
== temp_dir_count
)
431 /* Create the temporary file in a critical section, to avoid races. */
432 sigprocmask (SIG_BLOCK
, &caught_signals
, &oldset
);
437 sigprocmask (SIG_SETMASK
, &oldset
, NULL
);
440 if (fd
< 0 || (*pfp
= fdopen (fd
, "w")) == NULL
)
441 die (_("cannot create temporary file"), file
);
447 xfopen (const char *file
, const char *how
)
451 if (STREQ (file
, "-"))
455 have_read_stdin
= true;
463 if ((fp
= fopen_safer (file
, how
)) == NULL
)
464 die (_("open failed"), file
);
470 /* Close FP, whose name is FILE, and report any errors. */
473 xfclose (FILE *fp
, char const *file
)
477 /* Allow reading stdin from tty more than once. */
483 if (fclose (fp
) != 0)
484 die (_("close failed"), file
);
489 write_bytes (const char *buf
, size_t n_bytes
, FILE *fp
, const char *output_file
)
491 if (fwrite (buf
, 1, n_bytes
, fp
) != n_bytes
)
492 die (_("write failed"), output_file
);
495 /* Append DIR to the array of temporary directory names. */
497 add_temp_dir (char const *dir
)
499 if (temp_dir_count
== temp_dir_alloc
)
500 temp_dirs
= x2nrealloc (temp_dirs
, &temp_dir_alloc
, sizeof *temp_dirs
);
502 temp_dirs
[temp_dir_count
++] = dir
;
505 /* Search through the list of temporary files for NAME;
506 remove it if it is found on the list. */
509 zaptemp (const char *name
)
511 struct tempnode
*volatile *pnode
;
512 struct tempnode
*node
;
514 for (pnode
= &temphead
; (node
= *pnode
); pnode
= &node
->next
)
515 if (node
->name
== name
)
527 struct_month_cmp (const void *m1
, const void *m2
)
529 struct month
const *month1
= m1
;
530 struct month
const *month2
= m2
;
531 return strcmp (month1
->name
, month2
->name
);
536 /* Initialize the character class tables. */
543 for (i
= 0; i
< UCHAR_LIM
; ++i
)
545 blanks
[i
] = !!ISBLANK (i
);
546 nonprinting
[i
] = !ISPRINT (i
);
547 nondictionary
[i
] = !ISALNUM (i
) && !ISBLANK (i
);
548 fold_toupper
[i
] = (ISLOWER (i
) ? toupper (i
) : i
);
552 /* If we're not in the "C" locale, read different names for months. */
555 for (i
= 0; i
< MONTHS_PER_YEAR
; i
++)
562 s
= (char *) nl_langinfo (ABMON_1
+ i
);
564 monthtab
[i
].name
= name
= xmalloc (s_len
+ 1);
565 monthtab
[i
].val
= i
+ 1;
567 for (j
= 0; j
< s_len
; j
++)
568 name
[j
] = fold_toupper
[UCHAR (s
[j
])];
571 qsort ((void *) monthtab
, MONTHS_PER_YEAR
,
572 sizeof *monthtab
, struct_month_cmp
);
577 /* Specify the amount of main memory to use when sorting. */
579 specify_sort_size (char const *s
)
583 enum strtol_error e
= xstrtoumax (s
, &suffix
, 10, &n
, "EgGkKmMPtTYZ");
585 /* The default unit is KiB. */
586 if (e
== LONGINT_OK
&& ISDIGIT (suffix
[-1]))
588 if (n
<= UINTMAX_MAX
/ 1024)
591 e
= LONGINT_OVERFLOW
;
594 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
595 if (e
== LONGINT_INVALID_SUFFIX_CHAR
&& ISDIGIT (suffix
[-1]) && ! suffix
[1])
604 double mem
= physmem_total () * n
/ 100;
606 /* Use "<", not "<=", to avoid problems with rounding. */
607 if (mem
< UINTMAX_MAX
)
613 e
= LONGINT_OVERFLOW
;
620 /* If multiple sort sizes are specified, take the maximum, so
621 that option order does not matter. */
628 sort_size
= MAX (sort_size
, MIN_SORT_SIZE
);
632 e
= LONGINT_OVERFLOW
;
635 STRTOL_FATAL_ERROR (s
, _("sort size"), e
);
638 /* Return the default sort size. */
640 default_sort_size (void)
642 /* Let MEM be available memory or 1/8 of total memory, whichever
644 double avail
= physmem_available ();
645 double total
= physmem_total ();
646 double mem
= MAX (avail
, total
/ 8);
647 struct rlimit rlimit
;
649 /* Let SIZE be MEM, but no more than the maximum object size or
650 system resource limits. Avoid the MIN macro here, as it is not
651 quite right when only one argument is floating point. Don't
652 bother to check for values like RLIM_INFINITY since in practice
653 they are not much less than SIZE_MAX. */
654 size_t size
= SIZE_MAX
;
657 if (getrlimit (RLIMIT_DATA
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
658 size
= rlimit
.rlim_cur
;
660 if (getrlimit (RLIMIT_AS
, &rlimit
) == 0 && rlimit
.rlim_cur
< size
)
661 size
= rlimit
.rlim_cur
;
664 /* Leave a large safety margin for the above limits, as failure can
665 occur when they are exceeded. */
669 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
670 Exceeding RSS is not fatal, but can be quite slow. */
671 if (getrlimit (RLIMIT_RSS
, &rlimit
) == 0 && rlimit
.rlim_cur
/ 16 * 15 < size
)
672 size
= rlimit
.rlim_cur
/ 16 * 15;
675 /* Use no less than the minimum. */
676 return MAX (size
, MIN_SORT_SIZE
);
679 /* Return the sort buffer size to use with the input files identified
680 by FPS and FILES, which are alternate paths to the same files.
681 NFILES gives the number of input files; NFPS may be less. Assume
682 that each input line requires LINE_BYTES extra bytes' worth of line
683 information. Do not exceed a bound on the size: if the bound is
684 not specified by the user, use a default. */
687 sort_buffer_size (FILE *const *fps
, int nfps
,
688 char *const *files
, int nfiles
,
691 /* A bound on the input size. If zero, the bound hasn't been
693 static size_t size_bound
;
695 /* In the worst case, each input byte is a newline. */
696 size_t worst_case_per_input_byte
= line_bytes
+ 1;
698 /* Keep enough room for one extra input line and an extra byte.
699 This extra room might be needed when preparing to read EOF. */
700 size_t size
= worst_case_per_input_byte
+ 1;
704 for (i
= 0; i
< nfiles
; i
++)
710 if ((i
< nfps
? fstat (fileno (fps
[i
]), &st
)
711 : strcmp (files
[i
], "-") == 0 ? fstat (STDIN_FILENO
, &st
)
712 : stat (files
[i
], &st
))
714 die (_("stat failed"), files
[i
]);
716 if (S_ISREG (st
.st_mode
))
717 file_size
= st
.st_size
;
720 /* The file has unknown size. If the user specified a sort
721 buffer size, use that; otherwise, guess the size. */
724 file_size
= INPUT_FILE_SIZE_GUESS
;
729 size_bound
= sort_size
;
731 size_bound
= default_sort_size ();
734 /* Add the amount of memory needed to represent the worst case
735 where the input consists entirely of newlines followed by a
736 single non-newline. Check for overflow. */
737 worst_case
= file_size
* worst_case_per_input_byte
+ 1;
738 if (file_size
!= worst_case
/ worst_case_per_input_byte
739 || size_bound
- size
<= worst_case
)
747 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
748 must be at least sizeof (struct line). Allocate ALLOC bytes
752 initbuf (struct buffer
*buf
, size_t line_bytes
, size_t alloc
)
754 /* Ensure that the line array is properly aligned. If the desired
755 size cannot be allocated, repeatedly halve it until allocation
756 succeeds. The smaller allocation may hurt overall performance,
757 but that's better than failing. */
760 alloc
+= sizeof (struct line
) - alloc
% sizeof (struct line
);
761 buf
->buf
= malloc (alloc
);
765 if (alloc
<= line_bytes
+ 1)
769 buf
->line_bytes
= line_bytes
;
771 buf
->used
= buf
->left
= buf
->nlines
= 0;
775 /* Return one past the limit of the line array. */
777 static inline struct line
*
778 buffer_linelim (struct buffer
const *buf
)
780 return (struct line
*) (buf
->buf
+ buf
->alloc
);
783 /* Return a pointer to the first character of the field specified
787 begfield (const struct line
*line
, const struct keyfield
*key
)
789 register char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
790 register size_t sword
= key
->sword
;
791 register size_t schar
= key
->schar
;
792 register size_t remaining_bytes
;
794 /* The leading field separator itself is included in a field when -t
797 if (tab
!= TAB_DEFAULT
)
798 while (ptr
< lim
&& sword
--)
800 while (ptr
< lim
&& *ptr
!= tab
)
806 while (ptr
< lim
&& sword
--)
808 while (ptr
< lim
&& blanks
[UCHAR (*ptr
)])
810 while (ptr
< lim
&& !blanks
[UCHAR (*ptr
)])
814 if (key
->skipsblanks
)
815 while (ptr
< lim
&& blanks
[UCHAR (*ptr
)])
818 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
819 remaining_bytes
= lim
- ptr
;
820 if (schar
< remaining_bytes
)
828 /* Return the limit of (a pointer to the first character after) the field
829 in LINE specified by KEY. */
832 limfield (const struct line
*line
, const struct keyfield
*key
)
834 register char *ptr
= line
->text
, *lim
= ptr
+ line
->length
- 1;
835 register size_t eword
= key
->eword
, echar
= key
->echar
;
836 register size_t remaining_bytes
;
838 /* Move PTR past EWORD fields or to one past the last byte on LINE,
839 whichever comes first. If there are more than EWORD fields, leave
840 PTR pointing at the beginning of the field having zero-based index,
841 EWORD. If a delimiter character was specified (via -t), then that
842 `beginning' is the first character following the delimiting TAB.
843 Otherwise, leave PTR pointing at the first `blank' character after
844 the preceding field. */
845 if (tab
!= TAB_DEFAULT
)
846 while (ptr
< lim
&& eword
--)
848 while (ptr
< lim
&& *ptr
!= tab
)
850 if (ptr
< lim
&& (eword
| echar
))
854 while (ptr
< lim
&& eword
--)
856 while (ptr
< lim
&& blanks
[UCHAR (*ptr
)])
858 while (ptr
< lim
&& !blanks
[UCHAR (*ptr
)])
862 #ifdef POSIX_UNSPECIFIED
863 /* The following block of code makes GNU sort incompatible with
864 standard Unix sort, so it's ifdef'd out for now.
865 The POSIX spec isn't clear on how to interpret this.
866 FIXME: request clarification.
868 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
869 Date: Thu, 30 May 96 12:20:41 -0400
870 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
872 [...]I believe I've found another bug in `sort'.
877 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
880 $ /bin/sort -k1.7,1.7 </tmp/sort.in
884 Unix sort produced the answer I expected: sort on the single character
885 in column 7. GNU sort produced different results, because it disagrees
886 on the interpretation of the key-end spec "M.N". Unix sort reads this
887 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
888 "skip M-1 fields, then either N-1 characters or the rest of the current
889 field, whichever comes first". This extra clause applies only to
890 key-ends, not key-starts.
893 /* Make LIM point to the end of (one byte past) the current field. */
894 if (tab
!= TAB_DEFAULT
)
897 newlim
= memchr (ptr
, tab
, lim
- ptr
);
905 while (newlim
< lim
&& blanks
[UCHAR (*newlim
)])
907 while (newlim
< lim
&& !blanks
[UCHAR (*newlim
)])
913 /* If we're skipping leading blanks, don't start counting characters
914 until after skipping past any leading blanks. */
915 if (key
->skipsblanks
)
916 while (ptr
< lim
&& blanks
[UCHAR (*ptr
)])
919 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
920 remaining_bytes
= lim
- ptr
;
921 if (echar
< remaining_bytes
)
929 /* Return the number of trailing blanks in FIELD, with LEN bytes. */
932 trailing_blanks (char const *field
, size_t len
)
935 for (i
= len
; 0 < i
&& blanks
[UCHAR (field
[i
- 1])]; i
--)
940 /* Fill BUF reading from FP, moving buf->left bytes from the end
941 of buf->buf to the beginning first. If EOF is reached and the
942 file wasn't terminated by a newline, supply one. Set up BUF's line
943 table too. FILE is the name of the file corresponding to FP.
944 Return true if some input was read. */
947 fillbuf (struct buffer
*buf
, register FILE *fp
, char const *file
)
949 struct keyfield
const *key
= keylist
;
951 size_t line_bytes
= buf
->line_bytes
;
952 size_t mergesize
= merge_buffer_size
- MIN_MERGE_BUFFER_SIZE
;
957 if (buf
->used
!= buf
->left
)
959 memmove (buf
->buf
, buf
->buf
+ buf
->used
- buf
->left
, buf
->left
);
960 buf
->used
= buf
->left
;
966 char *ptr
= buf
->buf
+ buf
->used
;
967 struct line
*linelim
= buffer_linelim (buf
);
968 struct line
*line
= linelim
- buf
->nlines
;
969 size_t avail
= (char *) linelim
- buf
->nlines
* line_bytes
- ptr
;
970 char *line_start
= buf
->nlines
? line
->text
+ line
->length
: buf
->buf
;
972 while (line_bytes
+ 1 < avail
)
974 /* Read as many bytes as possible, but do not read so many
975 bytes that there might not be enough room for the
976 corresponding line array. The worst case is when the
977 rest of the input file consists entirely of newlines,
978 except that the last byte is not a newline. */
979 size_t readsize
= (avail
- 1) / (line_bytes
+ 1);
980 size_t bytes_read
= fread (ptr
, 1, readsize
, fp
);
981 char *ptrlim
= ptr
+ bytes_read
;
985 if (bytes_read
!= readsize
)
988 die (_("read failed"), file
);
992 if (buf
->buf
== ptrlim
)
994 if (ptrlim
[-1] != eol
)
999 /* Find and record each line in the just-read input. */
1000 while ((p
= memchr (ptr
, eol
, ptrlim
- ptr
)))
1004 line
->text
= line_start
;
1005 line
->length
= ptr
- line_start
;
1006 mergesize
= MAX (mergesize
, line
->length
);
1007 avail
-= line_bytes
;
1011 /* Precompute the position of the first key for
1013 line
->keylim
= (key
->eword
== SIZE_MAX
1015 : limfield (line
, key
));
1017 if (key
->sword
!= SIZE_MAX
)
1018 line
->keybeg
= begfield (line
, key
);
1021 if (key
->skipsblanks
)
1022 while (blanks
[UCHAR (*line_start
)])
1024 line
->keybeg
= line_start
;
1026 if (key
->skipeblanks
)
1028 size_t keylen
= line
->keylim
- line
->keybeg
;
1029 line
->keylim
-= trailing_blanks (line
->keybeg
, keylen
);
1041 buf
->used
= ptr
- buf
->buf
;
1042 buf
->nlines
= buffer_linelim (buf
) - line
;
1043 if (buf
->nlines
!= 0)
1045 buf
->left
= ptr
- line_start
;
1046 merge_buffer_size
= mergesize
+ MIN_MERGE_BUFFER_SIZE
;
1050 /* The current input line is too long to fit in the buffer.
1051 Double the buffer size and try again. */
1052 buf
->buf
= x2nrealloc (buf
->buf
, &buf
->alloc
, sizeof *(buf
->buf
));
1056 /* Compare strings A and B containing decimal fractions < 1. Each string
1057 should begin with a decimal point followed immediately by the digits
1058 of the fraction. Strings not of this form are considered to be zero. */
1060 /* The goal here, is to take two numbers a and b... compare these
1061 in parallel. Instead of converting each, and then comparing the
1062 outcome. Most likely stopping the comparison before the conversion
1063 is complete. The algorithm used, in the old sort:
1065 Algorithm: fraccompare
1066 Action : compare two decimal fractions
1067 accepts : char *a, char *b
1068 returns : -1 if a<b, 0 if a=b, 1 if a>b.
1071 if *a == decimal_point AND *b == decimal_point
1072 find first character different in a and b.
1073 if both are digits, return the difference *a - *b.
1076 if digit return 1, else 0
1079 if digit return -1, else 0
1080 if *a is a decimal_point
1081 skip past decimal_point and zeros
1082 if digit return 1, else 0
1083 if *b is a decimal_point
1084 skip past decimal_point and zeros
1085 if digit return -1, else 0
1089 fraccompare (register const char *a
, register const char *b
)
1091 if (*a
== decimal_point
&& *b
== decimal_point
)
1093 while (*++a
== *++b
)
1096 if (ISDIGIT (*a
) && ISDIGIT (*b
))
1099 goto a_trailing_nonzero
;
1101 goto b_trailing_nonzero
;
1104 else if (*a
++ == decimal_point
)
1107 while (*a
== NUMERIC_ZERO
)
1109 return ISDIGIT (*a
);
1111 else if (*b
++ == decimal_point
)
1114 while (*b
== NUMERIC_ZERO
)
1116 return - ISDIGIT (*b
);
1121 /* Compare strings A and B as numbers without explicitly converting them to
1122 machine numbers. Comparatively slow for short strings, but asymptotically
1126 numcompare (register const char *a
, register const char *b
)
1128 register int tmpa
, tmpb
, tmp
;
1129 register size_t log_a
, log_b
;
1134 while (blanks
[UCHAR (tmpa
)])
1136 while (blanks
[UCHAR (tmpb
)])
1139 if (tmpa
== NEGATION_SIGN
)
1143 while (tmpa
== NUMERIC_ZERO
|| IS_THOUSANDS_SEP (tmpa
));
1144 if (tmpb
!= NEGATION_SIGN
)
1146 if (tmpa
== decimal_point
)
1149 while (tmpa
== NUMERIC_ZERO
);
1152 while (tmpb
== NUMERIC_ZERO
|| IS_THOUSANDS_SEP (tmpb
))
1154 if (tmpb
== decimal_point
)
1157 while (tmpb
== NUMERIC_ZERO
);
1164 while (tmpb
== NUMERIC_ZERO
|| IS_THOUSANDS_SEP (tmpb
));
1166 while (tmpa
== tmpb
&& ISDIGIT (tmpa
))
1170 while (IS_THOUSANDS_SEP (tmpa
));
1173 while (IS_THOUSANDS_SEP (tmpb
));
1176 if ((tmpa
== decimal_point
&& !ISDIGIT (tmpb
))
1177 || (tmpb
== decimal_point
&& !ISDIGIT (tmpa
)))
1178 return -fraccompare (a
, b
);
1182 for (log_a
= 0; ISDIGIT (tmpa
); ++log_a
)
1185 while (IS_THOUSANDS_SEP (tmpa
));
1187 for (log_b
= 0; ISDIGIT (tmpb
); ++log_b
)
1190 while (IS_THOUSANDS_SEP (tmpb
));
1193 return log_a
< log_b
? 1 : -1;
1200 else if (tmpb
== NEGATION_SIGN
)
1204 while (tmpb
== NUMERIC_ZERO
|| IS_THOUSANDS_SEP (tmpb
));
1205 if (tmpb
== decimal_point
)
1208 while (tmpb
== NUMERIC_ZERO
);
1211 while (tmpa
== NUMERIC_ZERO
|| IS_THOUSANDS_SEP (tmpa
))
1213 if (tmpa
== decimal_point
)
1216 while (tmpa
== NUMERIC_ZERO
);
1223 while (tmpa
== NUMERIC_ZERO
|| IS_THOUSANDS_SEP (tmpa
))
1225 while (tmpb
== NUMERIC_ZERO
|| IS_THOUSANDS_SEP (tmpb
))
1228 while (tmpa
== tmpb
&& ISDIGIT (tmpa
))
1232 while (IS_THOUSANDS_SEP (tmpa
));
1235 while (IS_THOUSANDS_SEP (tmpb
));
1238 if ((tmpa
== decimal_point
&& !ISDIGIT (tmpb
))
1239 || (tmpb
== decimal_point
&& !ISDIGIT (tmpa
)))
1240 return fraccompare (a
, b
);
1244 for (log_a
= 0; ISDIGIT (tmpa
); ++log_a
)
1247 while (IS_THOUSANDS_SEP (tmpa
));
1249 for (log_b
= 0; ISDIGIT (tmpb
); ++log_b
)
1252 while (IS_THOUSANDS_SEP (tmpb
));
1255 return log_a
< log_b
? -1 : 1;
1265 general_numcompare (const char *sa
, const char *sb
)
1267 /* FIXME: add option to warn about failed conversions. */
1268 /* FIXME: maybe add option to try expensive FP conversion
1269 only if A and B can't be compared more cheaply/accurately. */
1273 double a
= strtod (sa
, &ea
);
1274 double b
= strtod (sb
, &eb
);
1276 /* Put conversion errors at the start of the collating sequence. */
1278 return sb
== eb
? 0 : -1;
1282 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1283 conversion errors but before numbers; sort them by internal
1284 bit-pattern, for lack of a more portable alternative. */
1290 : memcmp ((char *) &a
, (char *) &b
, sizeof a
));
1293 /* Return an integer in 1..12 of the month name S with length LEN.
1294 Return 0 if the name in S is not recognized. */
1297 getmonth (const char *s
, size_t len
)
1301 register int lo
= 0, hi
= MONTHS_PER_YEAR
, result
;
1303 while (len
> 0 && blanks
[UCHAR (*s
)])
1312 month
= alloca (len
+ 1);
1313 for (i
= 0; i
< len
; ++i
)
1314 month
[i
] = fold_toupper
[UCHAR (s
[i
])];
1315 len
-= trailing_blanks (month
, len
);
1320 int ix
= (lo
+ hi
) / 2;
1322 if (strncmp (month
, monthtab
[ix
].name
, strlen (monthtab
[ix
].name
)) < 0)
1327 while (hi
- lo
> 1);
1329 result
= (!strncmp (month
, monthtab
[lo
].name
, strlen (monthtab
[lo
].name
))
1330 ? monthtab
[lo
].val
: 0);
1335 /* Compare two lines A and B trying every key in sequence until there
1336 are no more keys or a difference is found. */
1339 keycompare (const struct line
*a
, const struct line
*b
)
1341 struct keyfield
const *key
= keylist
;
1343 /* For the first iteration only, the key positions have been
1344 precomputed for us. */
1345 register char *texta
= a
->keybeg
;
1346 register char *textb
= b
->keybeg
;
1347 register char *lima
= a
->keylim
;
1348 register char *limb
= b
->keylim
;
1354 register char const *translate
= key
->translate
;
1355 register bool const *ignore
= key
->ignore
;
1357 /* Find the lengths. */
1358 size_t lena
= lima
<= texta
? 0 : lima
- texta
;
1359 size_t lenb
= limb
<= textb
? 0 : limb
- textb
;
1361 if (key
->skipeblanks
)
1363 lena
-= trailing_blanks (texta
, lena
);
1364 lenb
-= trailing_blanks (textb
, lenb
);
1367 /* Actually compare the fields. */
1368 if (key
->numeric
| key
->general_numeric
)
1370 char savea
= *lima
, saveb
= *limb
;
1372 *lima
= *limb
= '\0';
1373 diff
= ((key
->numeric
? numcompare
: general_numcompare
)
1375 *lima
= savea
, *limb
= saveb
;
1377 else if (key
->month
)
1378 diff
= getmonth (texta
, lena
) - getmonth (textb
, lenb
);
1379 /* Sorting like this may become slow, so in a simple locale the user
1380 can select a faster sort that is similar to ascii sort */
1381 else if (HAVE_SETLOCALE
&& hard_LC_COLLATE
)
1383 if (ignore
|| translate
)
1385 char *copy_a
= alloca (lena
+ 1 + lenb
+ 1);
1386 char *copy_b
= copy_a
+ lena
+ 1;
1387 size_t new_len_a
, new_len_b
, i
;
1389 /* Ignore and/or translate chars before comparing. */
1390 for (new_len_a
= new_len_b
= i
= 0; i
< MAX (lena
, lenb
); i
++)
1394 copy_a
[new_len_a
] = (translate
1395 ? translate
[UCHAR (texta
[i
])]
1397 if (!ignore
|| !ignore
[UCHAR (texta
[i
])])
1402 copy_b
[new_len_b
] = (translate
1403 ? translate
[UCHAR (textb
[i
])]
1405 if (!ignore
|| !ignore
[UCHAR (textb
[i
])])
1410 diff
= xmemcoll (copy_a
, new_len_a
, copy_b
, new_len_b
);
1413 diff
= - NONZERO (lenb
);
1417 diff
= xmemcoll (texta
, lena
, textb
, lenb
);
1421 #define CMP_WITH_IGNORE(A, B) \
1426 while (texta < lima && ignore[UCHAR (*texta)]) \
1428 while (textb < limb && ignore[UCHAR (*textb)]) \
1430 if (! (texta < lima && textb < limb)) \
1432 diff = UCHAR (A) - UCHAR (B); \
1439 diff = (texta < lima) - (textb < limb); \
1444 CMP_WITH_IGNORE (translate
[UCHAR (*texta
)],
1445 translate
[UCHAR (*textb
)]);
1447 CMP_WITH_IGNORE (*texta
, *textb
);
1450 diff
= - NONZERO (lenb
);
1457 while (texta
< lima
&& textb
< limb
)
1459 diff
= (UCHAR (translate
[UCHAR (*texta
++)])
1460 - UCHAR (translate
[UCHAR (*textb
++)]));
1467 diff
= memcmp (texta
, textb
, MIN (lena
, lenb
));
1471 diff
= lena
< lenb
? -1 : lena
!= lenb
;
1481 /* Find the beginning and limit of the next field. */
1482 if (key
->eword
!= SIZE_MAX
)
1483 lima
= limfield (a
, key
), limb
= limfield (b
, key
);
1485 lima
= a
->text
+ a
->length
- 1, limb
= b
->text
+ b
->length
- 1;
1487 if (key
->sword
!= SIZE_MAX
)
1488 texta
= begfield (a
, key
), textb
= begfield (b
, key
);
1491 texta
= a
->text
, textb
= b
->text
;
1492 if (key
->skipsblanks
)
1494 while (texta
< lima
&& blanks
[UCHAR (*texta
)])
1496 while (textb
< limb
&& blanks
[UCHAR (*textb
)])
1507 return key
->reverse
? -diff
: diff
;
1510 /* Compare two lines A and B, returning negative, zero, or positive
1511 depending on whether A compares less than, equal to, or greater than B. */
1514 compare (register const struct line
*a
, register const struct line
*b
)
1519 /* First try to compare on the specified keys (if any).
1520 The only two cases with no key at all are unadorned sort,
1521 and unadorned sort -r. */
1524 diff
= keycompare (a
, b
);
1526 if (diff
| unique
| stable
)
1530 /* If the keys all compare equal (or no keys were specified)
1531 fall through to the default comparison. */
1532 alen
= a
->length
- 1, blen
= b
->length
- 1;
1535 diff
= - NONZERO (blen
);
1538 else if (HAVE_SETLOCALE
&& hard_LC_COLLATE
)
1539 diff
= xmemcoll (a
->text
, alen
, b
->text
, blen
);
1540 else if (! (diff
= memcmp (a
->text
, b
->text
, MIN (alen
, blen
))))
1541 diff
= alen
< blen
? -1 : alen
!= blen
;
1543 return reverse
? -diff
: diff
;
1546 /* Check that the lines read from FILE_NAME come in order. Print a
1547 diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1548 false if they are not in order. Otherwise, print no diagnostic
1552 check (char const *file_name
)
1554 FILE *fp
= xfopen (file_name
, "r");
1555 struct buffer buf
; /* Input buffer. */
1556 struct line temp
; /* Copy of previous line. */
1558 uintmax_t line_number
= 0;
1559 struct keyfield
const *key
= keylist
;
1560 bool nonunique
= ! unique
;
1561 bool ordered
= true;
1563 initbuf (&buf
, sizeof (struct line
),
1564 MAX (merge_buffer_size
, sort_size
));
1567 while (fillbuf (&buf
, fp
, file_name
))
1569 struct line
const *line
= buffer_linelim (&buf
);
1570 struct line
const *linebase
= line
- buf
.nlines
;
1572 /* Make sure the line saved from the old buffer contents is
1573 less than or equal to the first line of the new buffer. */
1574 if (alloc
&& nonunique
<= compare (&temp
, line
- 1))
1578 struct line
const *disorder_line
= line
- 1;
1579 uintmax_t disorder_line_number
=
1580 buffer_linelim (&buf
) - disorder_line
+ line_number
;
1581 char hr_buf
[INT_BUFSIZE_BOUND (uintmax_t)];
1582 fprintf (stderr
, _("%s: %s:%s: disorder: "),
1583 program_name
, file_name
,
1584 umaxtostr (disorder_line_number
, hr_buf
));
1585 write_bytes (disorder_line
->text
, disorder_line
->length
, stderr
,
1586 _("standard error"));
1592 /* Compare each line in the buffer with its successor. */
1593 while (linebase
< --line
)
1594 if (nonunique
<= compare (line
, line
- 1))
1595 goto found_disorder
;
1597 line_number
+= buf
.nlines
;
1599 /* Save the last line of the buffer. */
1600 if (alloc
< line
->length
)
1607 alloc
= line
->length
;
1611 while (alloc
< line
->length
);
1613 temp
.text
= xrealloc (temp
.text
, alloc
);
1615 memcpy (temp
.text
, line
->text
, line
->length
);
1616 temp
.length
= line
->length
;
1619 temp
.keybeg
= temp
.text
+ (line
->keybeg
- line
->text
);
1620 temp
.keylim
= temp
.text
+ (line
->keylim
- line
->text
);
1624 xfclose (fp
, file_name
);
1631 /* Merge lines from FILES onto OFP. NFILES cannot be greater than
1632 NMERGE. Close input and output files before returning.
1633 OUTPUT_FILE gives the name of the output file; if OFP is NULL, the
1634 output file has not been opened yet. */
1637 mergefps (char **files
, register int nfiles
,
1638 FILE *ofp
, const char *output_file
)
1640 FILE *fps
[NMERGE
]; /* Input streams for each file. */
1641 struct buffer buffer
[NMERGE
]; /* Input buffers for each file. */
1642 struct line saved
; /* Saved line storage for unique check. */
1643 struct line
const *savedline
= NULL
;
1644 /* &saved if there is a saved line. */
1645 size_t savealloc
= 0; /* Size allocated for the saved line. */
1646 struct line
const *cur
[NMERGE
]; /* Current line in each line table. */
1647 struct line
const *base
[NMERGE
]; /* Base of each line table. */
1648 int ord
[NMERGE
]; /* Table representing a permutation of fps,
1649 such that cur[ord[0]] is the smallest line
1650 and will be next output. */
1651 register int i
, j
, t
;
1652 struct keyfield
const *key
= keylist
;
1655 /* Read initial lines from each input file. */
1656 for (i
= 0; i
< nfiles
; )
1658 fps
[i
] = xfopen (files
[i
], "r");
1659 initbuf (&buffer
[i
], sizeof (struct line
),
1660 MAX (merge_buffer_size
, sort_size
/ nfiles
));
1661 if (fillbuf (&buffer
[i
], fps
[i
], files
[i
]))
1663 struct line
const *linelim
= buffer_linelim (&buffer
[i
]);
1664 cur
[i
] = linelim
- 1;
1665 base
[i
] = linelim
- buffer
[i
].nlines
;
1670 /* fps[i] is empty; eliminate it from future consideration. */
1671 xfclose (fps
[i
], files
[i
]);
1673 free (buffer
[i
].buf
);
1675 for (j
= i
; j
< nfiles
; ++j
)
1676 files
[j
] = files
[j
+ 1];
1681 ofp
= xfopen (output_file
, "w");
1683 /* Set up the ord table according to comparisons among input lines.
1684 Since this only reorders two items if one is strictly greater than
1685 the other, it is stable. */
1686 for (i
= 0; i
< nfiles
; ++i
)
1688 for (i
= 1; i
< nfiles
; ++i
)
1689 if (0 < compare (cur
[ord
[i
- 1]], cur
[ord
[i
]]))
1690 t
= ord
[i
- 1], ord
[i
- 1] = ord
[i
], ord
[i
] = t
, i
= 0;
1692 /* Repeatedly output the smallest line until no input remains. */
1695 struct line
const *smallest
= cur
[ord
[0]];
1697 /* If uniquified output is turned on, output only the first of
1698 an identical series of lines. */
1701 if (savedline
&& compare (savedline
, smallest
))
1704 write_bytes (saved
.text
, saved
.length
, ofp
, output_file
);
1709 if (savealloc
< smallest
->length
)
1714 savealloc
= smallest
->length
;
1717 while ((savealloc
*= 2) < smallest
->length
);
1719 saved
.text
= xrealloc (saved
.text
, savealloc
);
1721 saved
.length
= smallest
->length
;
1722 memcpy (saved
.text
, smallest
->text
, saved
.length
);
1726 saved
.text
+ (smallest
->keybeg
- smallest
->text
);
1728 saved
.text
+ (smallest
->keylim
- smallest
->text
);
1733 write_bytes (smallest
->text
, smallest
->length
, ofp
, output_file
);
1735 /* Check if we need to read more lines into core. */
1736 if (base
[ord
[0]] < smallest
)
1737 cur
[ord
[0]] = smallest
- 1;
1740 if (fillbuf (&buffer
[ord
[0]], fps
[ord
[0]], files
[ord
[0]]))
1742 struct line
const *linelim
= buffer_linelim (&buffer
[ord
[0]]);
1743 cur
[ord
[0]] = linelim
- 1;
1744 base
[ord
[0]] = linelim
- buffer
[ord
[0]].nlines
;
1748 /* We reached EOF on fps[ord[0]]. */
1749 for (i
= 1; i
< nfiles
; ++i
)
1750 if (ord
[i
] > ord
[0])
1753 xfclose (fps
[ord
[0]], files
[ord
[0]]);
1754 zaptemp (files
[ord
[0]]);
1755 free (buffer
[ord
[0]].buf
);
1756 for (i
= ord
[0]; i
< nfiles
; ++i
)
1758 fps
[i
] = fps
[i
+ 1];
1759 files
[i
] = files
[i
+ 1];
1760 buffer
[i
] = buffer
[i
+ 1];
1761 cur
[i
] = cur
[i
+ 1];
1762 base
[i
] = base
[i
+ 1];
1764 for (i
= 0; i
< nfiles
; ++i
)
1765 ord
[i
] = ord
[i
+ 1];
1770 /* The new line just read in may be larger than other lines
1771 already in core; push it back in the queue until we encounter
1772 a line larger than it. */
1773 for (i
= 1; i
< nfiles
; ++i
)
1775 t
= compare (cur
[ord
[0]], cur
[ord
[i
]]);
1777 t
= ord
[0] - ord
[i
];
1782 for (j
= 1; j
< i
; ++j
)
1783 ord
[j
- 1] = ord
[j
];
1787 if (unique
&& savedline
)
1789 write_bytes (saved
.text
, saved
.length
, ofp
, output_file
);
1793 xfclose (ofp
, output_file
);
1796 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1797 and HI (with NHI members). T, LO, and HI point just past their
1798 respective arrays, and the arrays are in reverse order. NLO and
1799 NHI must be positive, and HI - NHI must equal T - (NLO + NHI). */
1802 mergelines (struct line
*t
,
1803 struct line
const *lo
, size_t nlo
,
1804 struct line
const *hi
, size_t nhi
)
1807 if (compare (lo
- 1, hi
- 1) <= 0)
1812 /* HI - NHI equalled T - (NLO + NHI) when this function
1813 began. Therefore HI must equal T now, and there is no
1814 need to copy from HI to T. */
1832 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1833 NLINES must be at least 2.
1834 The input and output arrays are in reverse order, and LINES and
1835 TEMP point just past the end of their respective arrays.
1837 Use a recursive divide-and-conquer algorithm, in the style
1838 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
1839 the optimization suggested by exercise 5.2.4-10; this requires room
1840 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
1841 writes that this memory optimization was originally published by
1842 D. A. Bell, Comp J. 1 (1958), 75. */
1845 sortlines (struct line
*lines
, size_t nlines
, struct line
*temp
)
1849 if (0 < compare (&lines
[-1], &lines
[-2]))
1851 struct line tmp
= lines
[-1];
1852 lines
[-1] = lines
[-2];
1858 size_t nlo
= nlines
/ 2;
1859 size_t nhi
= nlines
- nlo
;
1860 struct line
*lo
= lines
;
1861 struct line
*hi
= lines
- nlo
;
1862 struct line
*sorted_lo
= temp
;
1864 sortlines (hi
, nhi
, temp
);
1866 sortlines_temp (lo
, nlo
, sorted_lo
);
1868 sorted_lo
[-1] = lo
[-1];
1870 mergelines (lines
, sorted_lo
, nlo
, hi
, nhi
);
1874 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1875 rather than sorting in place. */
1878 sortlines_temp (struct line
*lines
, size_t nlines
, struct line
*temp
)
1882 bool swap
= (0 < compare (&lines
[-1], &lines
[-2]));
1883 temp
[-1] = lines
[-1 - swap
];
1884 temp
[-2] = lines
[-2 + swap
];
1888 size_t nlo
= nlines
/ 2;
1889 size_t nhi
= nlines
- nlo
;
1890 struct line
*lo
= lines
;
1891 struct line
*hi
= lines
- nlo
;
1892 struct line
*sorted_hi
= temp
- nlo
;
1894 sortlines_temp (hi
, nhi
, sorted_hi
);
1896 sortlines (lo
, nlo
, temp
);
1898 mergelines (temp
, lo
, nlo
, sorted_hi
, nhi
);
1902 /* Return the index of the first of NFILES FILES that is the same file
1903 as OUTFILE. If none can be the same, return NFILES. Consider an
1904 input pipe to be the same as OUTFILE, since the pipe might be the
1905 output of a command like "cat OUTFILE". */
1908 first_same_file (char * const *files
, int nfiles
, char const *outfile
)
1911 bool got_outstat
= false;
1912 struct stat instat
, outstat
;
1914 for (i
= 0; i
< nfiles
; i
++)
1916 bool standard_input
= STREQ (files
[i
], "-");
1918 if (STREQ (outfile
, files
[i
]) && ! standard_input
)
1924 if ((STREQ (outfile
, "-")
1925 ? fstat (STDOUT_FILENO
, &outstat
)
1926 : stat (outfile
, &outstat
))
1931 if (((standard_input
1932 ? fstat (STDIN_FILENO
, &instat
)
1933 : stat (files
[i
], &instat
))
1935 && (S_ISFIFO (instat
.st_mode
) || SAME_INODE (instat
, outstat
)))
1942 /* Merge NFILES FILES onto OUTPUT_FILE. However, merge at most
1943 MAX_MERGE input files directly onto OUTPUT_FILE. MAX_MERGE cannot
1947 merge (char **files
, int nfiles
, int max_merge
, char const *output_file
)
1949 while (max_merge
< nfiles
)
1954 for (i
= 0; i
< nfiles
/ NMERGE
; ++i
)
1956 temp
= create_temp_file (&tfp
);
1957 mergefps (&files
[i
* NMERGE
], NMERGE
, tfp
, temp
);
1960 temp
= create_temp_file (&tfp
);
1961 mergefps (&files
[i
* NMERGE
], nfiles
% NMERGE
, tfp
, temp
);
1968 mergefps (files
, nfiles
, NULL
, output_file
);
1971 /* Sort NFILES FILES onto OUTPUT_FILE. */
1974 sort (char * const *files
, int nfiles
, char const *output_file
)
1977 int n_temp_files
= 0;
1978 bool output_file_created
= false;
1984 char const *temp_output
;
1985 char const *file
= *files
;
1986 FILE *fp
= xfopen (file
, "r");
1988 size_t bytes_per_line
= (2 * sizeof (struct line
)
1989 - sizeof (struct line
) / 2);
1992 initbuf (&buf
, bytes_per_line
,
1993 sort_buffer_size (&fp
, 1, files
, nfiles
, bytes_per_line
));
1998 while (fillbuf (&buf
, fp
, file
))
2001 struct line
*linebase
;
2003 if (buf
.eof
&& nfiles
2004 && (bytes_per_line
+ 1
2005 < (buf
.alloc
- buf
.used
- bytes_per_line
* buf
.nlines
)))
2007 /* End of file, but there is more input and buffer room.
2008 Concatenate the next input file; this is faster in
2010 buf
.left
= buf
.used
;
2014 line
= buffer_linelim (&buf
);
2015 linebase
= line
- buf
.nlines
;
2017 sortlines (line
, buf
.nlines
, linebase
);
2018 if (buf
.eof
&& !nfiles
&& !n_temp_files
&& !buf
.left
)
2021 tfp
= xfopen (output_file
, "w");
2022 temp_output
= output_file
;
2023 output_file_created
= true;
2028 temp_output
= create_temp_file (&tfp
);
2034 write_bytes (line
->text
, line
->length
, tfp
, temp_output
);
2036 while (linebase
< line
&& compare (line
, line
- 1) == 0)
2039 while (linebase
< line
);
2041 xfclose (tfp
, temp_output
);
2043 if (output_file_created
)
2052 if (! output_file_created
)
2054 int i
= n_temp_files
;
2055 struct tempnode
*node
;
2056 char **tempfiles
= xnmalloc (n_temp_files
, sizeof *tempfiles
);
2057 for (node
= temphead
; i
> 0; node
= node
->next
)
2058 tempfiles
[--i
] = node
->name
;
2059 merge (tempfiles
, n_temp_files
, NMERGE
, output_file
);
2064 /* Insert key KEY at the end of the key list. */
2067 insertkey (struct keyfield
*key
)
2069 struct keyfield
**p
;
2071 for (p
= &keylist
; *p
; p
= &(*p
)->next
)
2077 /* Report a bad field specification SPEC, with extra info MSGID. */
2079 static void badfieldspec (char const *, char const *)
2082 badfieldspec (char const *spec
, char const *msgid
)
2084 error (SORT_FAILURE
, 0, _("%s: invalid field specification `%s'"),
2089 /* Parse the leading integer in STRING and store the resulting value
2090 (which must fit into size_t) into *VAL. Return the address of the
2091 suffix after the integer. If MSGID is NULL, return NULL after
2092 failure; otherwise, report MSGID and exit on failure. */
2095 parse_field_count (char const *string
, size_t *val
, char const *msgid
)
2100 switch (xstrtoumax (string
, &suffix
, 10, &n
, ""))
2103 case LONGINT_INVALID_SUFFIX_CHAR
:
2108 case LONGINT_OVERFLOW
:
2109 case LONGINT_OVERFLOW
| LONGINT_INVALID_SUFFIX_CHAR
:
2111 error (SORT_FAILURE
, 0, _("%s: count `%.*s' too large"),
2112 _(msgid
), (int) (suffix
- string
), string
);
2115 case LONGINT_INVALID
:
2117 error (SORT_FAILURE
, 0, _("%s: invalid count at start of `%s'"),
2125 /* Handle interrupts and hangups. */
2128 sighandler (int sig
)
2130 #ifndef SA_NOCLDSTOP
2131 signal (sig
, SIG_IGN
);
2138 struct sigaction sigact
;
2140 sigact
.sa_handler
= SIG_DFL
;
2141 sigemptyset (&sigact
.sa_mask
);
2142 sigact
.sa_flags
= 0;
2143 sigaction (sig
, &sigact
, NULL
);
2146 signal (sig
, SIG_DFL
);
2152 /* Set the ordering options for KEY specified in S.
2153 Return the address of the first character in S that
2154 is not a valid ordering option.
2155 BLANKTYPE is the kind of blanks that 'b' should skip. */
2158 set_ordering (register const char *s
, struct keyfield
*key
,
2159 enum blanktype blanktype
)
2166 if (blanktype
== bl_start
|| blanktype
== bl_both
)
2167 key
->skipsblanks
= true;
2168 if (blanktype
== bl_end
|| blanktype
== bl_both
)
2169 key
->skipeblanks
= true;
2172 key
->ignore
= nondictionary
;
2175 key
->translate
= fold_toupper
;
2178 key
->general_numeric
= true;
2181 /* Option order should not matter, so don't let -i override
2182 -d. -d implies -i, but -i does not imply -d. */
2184 key
->ignore
= nonprinting
;
2190 key
->numeric
= true;
2193 key
->reverse
= true;
2203 static struct keyfield
*
2206 struct keyfield
*key
= xzalloc (sizeof *key
);
2207 key
->eword
= SIZE_MAX
;
2212 main (int argc
, char **argv
)
2214 struct keyfield
*key
;
2215 struct keyfield gkey
;
2218 bool checkonly
= false;
2219 bool mergeonly
= false;
2221 bool posixly_correct
= (getenv ("POSIXLY_CORRECT") != NULL
);
2222 bool obsolete_usage
= (posix2_version () < 200112);
2223 char const *short_options
= (obsolete_usage
2224 ? COMMON_SHORT_OPTIONS
"y::"
2225 : COMMON_SHORT_OPTIONS
"y:");
2226 char *minus
= "-", **files
;
2227 char const *outfile
= minus
;
2228 static int const sigs
[] = { SIGHUP
, SIGINT
, SIGPIPE
, SIGTERM
};
2229 unsigned int nsigs
= sizeof sigs
/ sizeof *sigs
;
2231 struct sigaction oldact
, newact
;
2234 initialize_main (&argc
, &argv
);
2235 program_name
= argv
[0];
2236 setlocale (LC_ALL
, "");
2237 bindtextdomain (PACKAGE
, LOCALEDIR
);
2238 textdomain (PACKAGE
);
2242 initialize_exit_failure (SORT_FAILURE
);
2243 atexit (close_stdout
);
2245 hard_LC_COLLATE
= hard_locale (LC_COLLATE
);
2246 #if HAVE_NL_LANGINFO
2247 hard_LC_TIME
= hard_locale (LC_TIME
);
2251 /* Let's get locale's representation of the decimal point */
2253 struct lconv
const *lconvp
= localeconv ();
2255 /* If the locale doesn't define a decimal point, or if the decimal
2256 point is multibyte, use the C decimal point. We don't support
2257 multibyte decimal points yet. */
2258 decimal_point
= *lconvp
->decimal_point
;
2259 if (! decimal_point
|| lconvp
->decimal_point
[1])
2260 decimal_point
= C_DECIMAL_POINT
;
2262 /* We don't support multibyte thousands separators yet. */
2263 th_sep
= *lconvp
->thousands_sep
;
2264 if (! th_sep
|| lconvp
->thousands_sep
[1])
2265 th_sep
= CHAR_MAX
+ 1;
2269 have_read_stdin
= false;
2275 sigemptyset (&caught_signals
);
2276 for (i
= 0; i
< nsigs
; i
++)
2277 sigaddset (&caught_signals
, sigs
[i
]);
2278 newact
.sa_handler
= sighandler
;
2279 newact
.sa_mask
= caught_signals
;
2280 newact
.sa_flags
= 0;
2286 for (i
= 0; i
< nsigs
; i
++)
2290 sigaction (sig
, NULL
, &oldact
);
2291 if (oldact
.sa_handler
!= SIG_IGN
)
2292 sigaction (sig
, &newact
, NULL
);
2294 if (signal (sig
, SIG_IGN
) != SIG_IGN
)
2295 signal (sig
, sighandler
);
2300 gkey
.sword
= gkey
.eword
= SIZE_MAX
;
2302 gkey
.translate
= NULL
;
2303 gkey
.numeric
= gkey
.general_numeric
= gkey
.month
= gkey
.reverse
= false;
2304 gkey
.skipsblanks
= gkey
.skipeblanks
= false;
2306 files
= xnmalloc (argc
, sizeof *files
);
2310 /* Parse an operand as a file after "--" was seen; or if
2311 pedantic and a file was seen, unless the POSIX version
2312 predates 1003.1-2001 and -c was not seen and the operand is
2313 "-o FILE" or "-oFILE". */
2316 || (posixly_correct
&& nfiles
!= 0
2317 && ! (obsolete_usage
2320 && argv
[optind
][0] == '-' && argv
[optind
][1] == 'o'
2321 && (argv
[optind
][2] || optind
+ 1 != argc
)))
2322 || ((c
= getopt_long (argc
, argv
, short_options
,
2323 long_options
, NULL
))
2328 files
[nfiles
++] = argv
[optind
++];
2334 if (obsolete_usage
&& optarg
[0] == '+')
2336 /* Treat +POS1 [-POS2] as a key if possible; but silently
2337 treat an operand as a file if it is not a valid +POS1. */
2339 s
= parse_field_count (optarg
+ 1, &key
->sword
, NULL
);
2341 s
= parse_field_count (s
+ 1, &key
->schar
, NULL
);
2342 if (! (key
->sword
| key
->schar
))
2343 key
->sword
= SIZE_MAX
;
2344 if (! s
|| *set_ordering (s
, key
, bl_start
))
2351 if (optind
!= argc
&& argv
[optind
][0] == '-'
2352 && ISDIGIT (argv
[optind
][1]))
2354 char const *optarg1
= argv
[optind
++];
2355 s
= parse_field_count (optarg1
+ 1, &key
->eword
,
2356 N_("invalid number after `-'"));
2358 s
= parse_field_count (s
+ 1, &key
->echar
,
2359 N_("invalid number after `.'"));
2360 if (*set_ordering (s
, key
, bl_end
))
2361 badfieldspec (optarg1
,
2362 N_("stray character in field spec"));
2368 files
[nfiles
++] = optarg
;
2383 set_ordering (str
, &gkey
, bl_both
);
2395 s
= parse_field_count (optarg
, &key
->sword
,
2396 N_("invalid number at field start"));
2399 /* Provoke with `sort -k0' */
2400 badfieldspec (optarg
, N_("field number is zero"));
2404 s
= parse_field_count (s
+ 1, &key
->schar
,
2405 N_("invalid number after `.'"));
2408 /* Provoke with `sort -k1.0' */
2409 badfieldspec (optarg
, N_("character offset is zero"));
2412 if (! (key
->sword
| key
->schar
))
2413 key
->sword
= SIZE_MAX
;
2414 s
= set_ordering (s
, key
, bl_start
);
2417 key
->eword
= SIZE_MAX
;
2423 s
= parse_field_count (s
+ 1, &key
->eword
,
2424 N_("invalid number after `,'"));
2427 /* Provoke with `sort -k1,0' */
2428 badfieldspec (optarg
, N_("field number is zero"));
2431 s
= parse_field_count (s
+ 1, &key
->echar
,
2432 N_("invalid number after `.'"));
2435 /* `-k 2,3' is equivalent to `+1 -3'. */
2438 s
= set_ordering (s
, key
, bl_end
);
2441 badfieldspec (optarg
, N_("stray character in field spec"));
2450 if (outfile
!= minus
&& strcmp (outfile
, optarg
) != 0)
2451 error (SORT_FAILURE
, 0, _("multiple output files specified"));
2460 specify_sort_size (optarg
);
2465 int newtab
= optarg
[0];
2467 error (SORT_FAILURE
, 0, _("empty tab"));
2470 if (strcmp (optarg
, "\\0") == 0)
2474 /* Provoke with `sort -txx'. Complain about
2475 "multi-character tab" instead of "multibyte tab", so
2476 that the diagnostic's wording does not need to be
2477 changed once multibyte characters are supported. */
2478 error (SORT_FAILURE
, 0, _("multi-character tab `%s'"),
2482 if (tab
!= TAB_DEFAULT
&& tab
!= newtab
)
2483 error (SORT_FAILURE
, 0, _("incompatible tabs"));
2489 add_temp_dir (optarg
);
2497 /* Accept and ignore e.g. -y0 for compatibility with Solaris
2498 2.x through Solaris 7. -y is marked as obsolete starting
2506 case_GETOPT_HELP_CHAR
;
2508 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
2511 usage (SORT_FAILURE
);
2515 /* Inheritance of global options to individual keys. */
2516 for (key
= keylist
; key
; key
= key
->next
)
2517 if (! (key
->ignore
|| key
->translate
2518 || (key
->skipsblanks
| key
->reverse
2519 | key
->skipeblanks
| key
->month
| key
->numeric
2520 | key
->general_numeric
)))
2522 key
->ignore
= gkey
.ignore
;
2523 key
->translate
= gkey
.translate
;
2524 key
->skipsblanks
= gkey
.skipsblanks
;
2525 key
->skipeblanks
= gkey
.skipeblanks
;
2526 key
->month
= gkey
.month
;
2527 key
->numeric
= gkey
.numeric
;
2528 key
->general_numeric
= gkey
.general_numeric
;
2529 key
->reverse
= gkey
.reverse
;
2532 if (!keylist
&& (gkey
.ignore
|| gkey
.translate
2533 || (gkey
.skipsblanks
| gkey
.skipeblanks
| gkey
.month
2534 | gkey
.numeric
| gkey
.general_numeric
)))
2536 reverse
= gkey
.reverse
;
2538 if (temp_dir_count
== 0)
2540 char const *tmp_dir
= getenv ("TMPDIR");
2541 add_temp_dir (tmp_dir
? tmp_dir
: DEFAULT_TMPDIR
);
2553 error (SORT_FAILURE
, 0, _("extra operand `%s' not allowed with -c"),
2556 /* POSIX requires that sort return 1 IFF invoked with -c and the
2557 input is not properly sorted. */
2558 exit (check (files
[0]) ? EXIT_SUCCESS
: SORT_OUT_OF_ORDER
);
2563 int max_merge
= first_same_file (files
, MIN (nfiles
, NMERGE
), outfile
);
2564 merge (files
, nfiles
, max_merge
, outfile
);
2567 sort (files
, nfiles
, outfile
);
2569 if (have_read_stdin
&& fclose (stdin
) == EOF
)
2570 die (_("close failed"), "-");
2572 exit (EXIT_SUCCESS
);