src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 88, 1991-2004 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17
  18    Written December 1988 by Mike Haertel.
  19    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  20    or (US mail) as Mike Haertel c/o Free Software Foundation.
  21
  22    Ørn E. Hansen added NLS support in 1997.  */
  23
  24 #include <config.h>
  25
  26 #include <getopt.h>
  27 #include <sys/types.h>
  28 #include <signal.h>
  29 #include <stdio.h>
  30 #include "system.h"
  31 #include "error.h"
  32 #include "hard-locale.h"
  33 #include "inttostr.h"
  34 #include "physmem.h"
  35 #include "posixver.h"
  36 #include "quote.h"
  37 #include "stdio-safer.h"
  38 #include "xmemcoll.h"
  39 #include "xstrtol.h"
  40
  41 #if HAVE_SYS_RESOURCE_H
  42 # include <sys/resource.h>
  43 #endif
  44 #ifndef RLIMIT_DATA
  45 struct rlimit { size_t rlim_cur; };
  46 # define getrlimit(Resource, Rlp) (-1)
  47 #endif
  48
  49 /* The official name of this program (e.g., no `g' prefix).  */
  50 #define PROGRAM_NAME "sort"
  51
  52 #define AUTHORS "Mike Haertel", "Paul Eggert"
  53
  54 #if HAVE_LANGINFO_CODESET
  55 # include <langinfo.h>
  56 #endif
  57
  58 #ifndef SA_NOCLDSTOP
  59 # define sigprocmask(How, Set, Oset) /* empty */
  60 # define sigset_t int
  61 #endif
  62
  63 #ifndef STDC_HEADERS
  64 double strtod ();
  65 #endif
  66
  67 #define UCHAR_LIM (UCHAR_MAX + 1)
  68
  69 #ifndef DEFAULT_TMPDIR
  70 # define DEFAULT_TMPDIR "/tmp"
  71 #endif
  72
  73 /* Exit statuses.  */
  74 enum
  75   {
  76     /* POSIX says to exit with status 1 if invoked with -c and the
  77        input is not properly sorted.  */
  78     SORT_OUT_OF_ORDER = 1,
  79
  80     /* POSIX says any other irregular exit must exit with a status
  81        code greater than 1.  */
  82     SORT_FAILURE = 2
  83   };
  84
  85 #define NEGATION_SIGN   '-'
  86 #define NUMERIC_ZERO    '0'
  87
  88 /* The representation of the decimal point in the current locale.  */
  89 static char decimal_point;
  90
  91 /* Thousands separator; if CHAR_MAX + 1, then there isn't one.  */
  92 static int thousands_sep;
  93
  94 /* Nonzero if the corresponding locales are hard.  */
  95 static bool hard_LC_COLLATE;
  96 #if HAVE_NL_LANGINFO
  97 static bool hard_LC_TIME;
  98 #endif
  99
 100 #define NONZERO(x) ((x) != 0)
 101
 102 /* The kind of blanks for '-b' to skip in various options. */
 103 enum blanktype { bl_start, bl_end, bl_both };
 104
 105 /* The character marking end of line. Default to \n. */
 106 static char eolchar = '\n';
 107
 108 /* Lines are held in core as counted strings. */
 109 struct line
 110 {
 111   char *text;                   /* Text of the line. */
 112   size_t length;                /* Length including final newline. */
 113   char *keybeg;                 /* Start of first key. */
 114   char *keylim;                 /* Limit of first key. */
 115 };
 116
 117 /* Input buffers. */
 118 struct buffer
 119 {
 120   char *buf;                    /* Dynamically allocated buffer,
 121                                    partitioned into 3 regions:
 122                                    - input data;
 123                                    - unused area;
 124                                    - an array of lines, in reverse order.  */
 125   size_t used;                  /* Number of bytes used for input data.  */
 126   size_t nlines;                /* Number of lines in the line array.  */
 127   size_t alloc;                 /* Number of bytes allocated. */
 128   size_t left;                  /* Number of bytes left from previous reads. */
 129   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 130   bool eof;                     /* An EOF has been read.  */
 131 };
 132
 133 struct keyfield
 134 {
 135   size_t sword;                 /* Zero-origin 'word' to start at. */
 136   size_t schar;                 /* Additional characters to skip. */
 137   size_t eword;                 /* Zero-origin first word after field. */
 138   size_t echar;                 /* Additional characters in field. */
 139   bool const *ignore;           /* Boolean array of characters to ignore. */
 140   char const *translate;        /* Translation applied to characters. */
 141   bool skipsblanks;             /* Skip leading blanks when finding start.  */
 142   bool skipeblanks;             /* Skip leading blanks when finding end.  */
 143   bool numeric;                 /* Flag for numeric comparison.  Handle
 144                                    strings of digits with optional decimal
 145                                    point, but no exponential notation. */
 146   bool general_numeric;         /* Flag for general, numeric comparison.
 147                                    Handle numbers in exponential notation. */
 148   bool month;                   /* Flag for comparison by month name. */
 149   bool reverse;                 /* Reverse the sense of comparison. */
 150   struct keyfield *next;        /* Next keyfield to try. */
 151 };
 152
 153 struct month
 154 {
 155   char const *name;
 156   int val;
 157 };
 158
 159 /* The name this program was run with. */
 160 char *program_name;
 161
 162 /* FIXME: None of these tables work with multibyte character sets.
 163    Also, there are many other bugs when handling multibyte characters.
 164    One way to fix this is to rewrite `sort' to use wide characters
 165    internally, but doing this with good performance is a bit
 166    tricky.  */
 167
 168 /* Table of blanks.  */
 169 static bool blanks[UCHAR_LIM];
 170
 171 /* Table of non-printing characters. */
 172 static bool nonprinting[UCHAR_LIM];
 173
 174 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 175 static bool nondictionary[UCHAR_LIM];
 176
 177 /* Translation table folding lower case to upper.  */
 178 static char fold_toupper[UCHAR_LIM];
 179
 180 #define MONTHS_PER_YEAR 12
 181
 182 /* Table mapping month names to integers.
 183    Alphabetic order allows binary search. */
 184 static struct month monthtab[] =
 185 {
 186   {"APR", 4},
 187   {"AUG", 8},
 188   {"DEC", 12},
 189   {"FEB", 2},
 190   {"JAN", 1},
 191   {"JUL", 7},
 192   {"JUN", 6},
 193   {"MAR", 3},
 194   {"MAY", 5},
 195   {"NOV", 11},
 196   {"OCT", 10},
 197   {"SEP", 9}
 198 };
 199
 200 /* During the merge phase, the number of files to merge at once. */
 201 #define NMERGE 16
 202
 203 /* Minimum size for a merge or check buffer.  */
 204 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 205
 206 /* Minimum sort size; the code might not work with smaller sizes.  */
 207 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
 208
 209 /* The number of bytes needed for a merge or check buffer, which can
 210    function relatively efficiently even if it holds only one line.  If
 211    a longer line is seen, this value is increased.  */
 212 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 213
 214 /* The approximate maximum number of bytes of main memory to use, as
 215    specified by the user.  Zero if the user has not specified a size.  */
 216 static size_t sort_size;
 217
 218 /* The guessed size for non-regular files.  */
 219 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 220
 221 /* Array of directory names in which any temporary files are to be created. */
 222 static char const **temp_dirs;
 223
 224 /* Number of temporary directory names used.  */
 225 static size_t temp_dir_count;
 226
 227 /* Number of allocated slots in temp_dirs.  */
 228 static size_t temp_dir_alloc;
 229
 230 /* Flag to reverse the order of all comparisons. */
 231 static bool reverse;
 232
 233 /* Flag for stable sort.  This turns off the last ditch bytewise
 234    comparison of lines, and instead leaves lines in the same order
 235    they were read if all keys compare equal.  */
 236 static bool stable;
 237
 238 /* If TAB has this value, blanks separate fields.  */
 239 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 240
 241 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 242    separated by the empty string between a non-blank character and a blank
 243    character. */
 244 static int tab = TAB_DEFAULT;
 245
 246 /* Flag to remove consecutive duplicate lines from the output.
 247    Only the last of a sequence of equal lines will be output. */
 248 static bool unique;
 249
 250 /* Nonzero if any of the input files are the standard input. */
 251 static bool have_read_stdin;
 252
 253 /* List of key field comparisons to be tried.  */
 254 static struct keyfield *keylist;
 255
 256 static void sortlines_temp (struct line *, size_t, struct line *);
 257
 258 /* Report MESSAGE for FILE, then clean up and exit.
 259    If FILE is null, it represents standard output.  */
 260
 261 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 262 static void
 263 die (char const *message, char const *file)
 264 {
 265   error (0, errno, "%s: %s", message, file ? file : _("standard output"));
 266   exit (SORT_FAILURE);
 267 }
 268
 269 void
 270 usage (int status)
 271 {
 272   if (status != EXIT_SUCCESS)
 273     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 274              program_name);
 275   else
 276     {
 277       printf (_("\
 278 Usage: %s [OPTION]... [FILE]...\n\
 279 "),
 280               program_name);
 281       fputs (_("\
 282 Write sorted concatenation of all FILE(s) to standard output.\n\
 283 \n\
 284 Ordering options:\n\
 285 \n\
 286 "), stdout);
 287       fputs (_("\
 288 Mandatory arguments to long options are mandatory for short options too.\n\
 289 "), stdout);
 290       fputs (_("\
 291   -b, --ignore-leading-blanks ignore leading blanks\n\
 292   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 293   -f, --ignore-case           fold lower case to upper case characters\n\
 294 "), stdout);
 295       fputs (_("\
 296   -g, --general-numeric-sort  compare according to general numerical value\n\
 297   -i, --ignore-nonprinting    consider only printable characters\n\
 298   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 299   -n, --numeric-sort          compare according to string numerical value\n\
 300   -r, --reverse               reverse the result of comparisons\n\
 301 \n\
 302 "), stdout);
 303       fputs (_("\
 304 Other options:\n\
 305 \n\
 306   -c, --check               check whether input is sorted; do not sort\n\
 307   -k, --key=POS1[,POS2]     start a key at POS1, end it at POS 2 (origin 1)\n\
 308   -m, --merge               merge already sorted files; do not sort\n\
 309   -o, --output=FILE         write result to FILE instead of standard output\n\
 310   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 311   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 312 "), stdout);
 313       printf (_("\
 314   -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
 315   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 316                               multiple options specify multiple directories\n\
 317   -u, --unique              with -c, check for strict ordering;\n\
 318                               without -c, output only the first of an equal run\n\
 319 "), DEFAULT_TMPDIR);
 320       fputs (_("\
 321   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 322 "), stdout);
 323       fputs (HELP_OPTION_DESCRIPTION, stdout);
 324       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 325       fputs (_("\
 326 \n\
 327 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 328 in the field.  OPTS is one or more single-letter ordering options, which\n\
 329 override global ordering options for that key.  If no key is given, use the\n\
 330 entire line as the key.\n\
 331 \n\
 332 SIZE may be followed by the following multiplicative suffixes:\n\
 333 "), stdout);
 334       fputs (_("\
 335 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 336 \n\
 337 With no FILE, or when FILE is -, read standard input.\n\
 338 \n\
 339 *** WARNING ***\n\
 340 The locale specified by the environment affects sort order.\n\
 341 Set LC_ALL=C to get the traditional sort order that uses\n\
 342 native byte values.\n\
 343 "), stdout );
 344       printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 345     }
 346
 347   exit (status);
 348 }
 349
 350 #define COMMON_SHORT_OPTIONS "-bcdfgik:mMno:rsS:t:T:uz"
 351
 352 static struct option const long_options[] =
 353 {
 354   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 355   {"check", no_argument, NULL, 'c'},
 356   {"dictionary-order", no_argument, NULL, 'd'},
 357   {"ignore-case", no_argument, NULL, 'f'},
 358   {"general-numeric-sort", no_argument, NULL, 'g'},
 359   {"ignore-nonprinting", no_argument, NULL, 'i'},
 360   {"key", required_argument, NULL, 'k'},
 361   {"merge", no_argument, NULL, 'm'},
 362   {"month-sort", no_argument, NULL, 'M'},
 363   {"numeric-sort", no_argument, NULL, 'n'},
 364   {"output", required_argument, NULL, 'o'},
 365   {"reverse", no_argument, NULL, 'r'},
 366   {"stable", no_argument, NULL, 's'},
 367   {"buffer-size", required_argument, NULL, 'S'},
 368   {"field-separator", required_argument, NULL, 't'},
 369   {"temporary-directory", required_argument, NULL, 'T'},
 370   {"unique", no_argument, NULL, 'u'},
 371   {"zero-terminated", no_argument, NULL, 'z'},
 372   {GETOPT_HELP_OPTION_DECL},
 373   {GETOPT_VERSION_OPTION_DECL},
 374   {0, 0, 0, 0},
 375 };
 376
 377 /* The set of signals that are caught.  */
 378 static sigset_t caught_signals;
 379
 380 /* The list of temporary files. */
 381 struct tempnode
 382 {
 383   struct tempnode *volatile next;
 384   char name[1];  /* Actual size is 1 + file name length.  */
 385 };
 386 static struct tempnode *volatile temphead;
 387 static struct tempnode *volatile *temptail = &temphead;
 388
 389 /* Clean up any remaining temporary files.  */
 390
 391 static void
 392 cleanup (void)
 393 {
 394   struct tempnode const *node;
 395
 396   for (node = temphead; node; node = node->next)
 397     unlink (node->name);
 398 }
 399
 400 /* Create a new temporary file, returning its newly allocated name.
 401    Store into *PFP a stream open for writing.  */
 402
 403 static char *
 404 create_temp_file (FILE **pfp)
 405 {
 406   static char const slashbase[] = "/sortXXXXXX";
 407   static size_t temp_dir_index;
 408   sigset_t oldset;
 409   int fd;
 410   int saved_errno;
 411   char const *temp_dir = temp_dirs[temp_dir_index];
 412   size_t len = strlen (temp_dir);
 413   struct tempnode *node =
 414     xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
 415   char *file = node->name;
 416
 417   memcpy (file, temp_dir, len);
 418   memcpy (file + len, slashbase, sizeof slashbase);
 419   node->next = NULL;
 420   if (++temp_dir_index == temp_dir_count)
 421     temp_dir_index = 0;
 422
 423   /* Create the temporary file in a critical section, to avoid races.  */
 424   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 425   fd = mkstemp (file);
 426   if (0 <= fd)
 427     {
 428       *temptail = node;
 429       temptail = &node->next;
 430     }
 431   saved_errno = errno;
 432   sigprocmask (SIG_SETMASK, &oldset, NULL);
 433   errno = saved_errno;
 434
 435   if (fd < 0 || (*pfp = fdopen (fd, "w")) == NULL)
 436     die (_("cannot create temporary file"), file);
 437
 438   return file;
 439 }
 440
 441 /* Return a stream for FILE, opened with mode HOW.  A null FILE means
 442    standard output; HOW should be "w".  When opening for input, "-"
 443    means standard input.  To avoid confusion, do not return file
 444    descriptors 0, 1, or 2.  */
 445
 446 static FILE *
 447 xfopen (const char *file, const char *how)
 448 {
 449   FILE *fp;
 450
 451   if (!file)
 452     fp = stdout;
 453   else if (STREQ (file, "-") && *how == 'r')
 454     {
 455       have_read_stdin = true;
 456       fp = stdin;
 457     }
 458   else
 459     {
 460       if ((fp = fopen_safer (file, how)) == NULL)
 461         die (_("open failed"), file);
 462     }
 463
 464   return fp;
 465 }
 466
 467 /* Close FP, whose name is FILE, and report any errors.  */
 468
 469 static void
 470 xfclose (FILE *fp, char const *file)
 471 {
 472   if (fp == stdin)
 473     {
 474       /* Allow reading stdin from tty more than once. */
 475       if (feof (fp))
 476         clearerr (fp);
 477     }
 478   else if (fp == stdout)
 479     {
 480       /* Don't close stdout just yet.  close_stdout does that.  */
 481       if (fflush (fp) != 0)
 482         die (_("fflush failed"), file);
 483     }
 484   else
 485     {
 486       if (fclose (fp) != 0)
 487         die (_("close failed"), file);
 488     }
 489 }
 490
 491 static void
 492 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
 493 {
 494   if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
 495     die (_("write failed"), output_file);
 496 }
 497
 498 /* Append DIR to the array of temporary directory names.  */
 499 static void
 500 add_temp_dir (char const *dir)
 501 {
 502   if (temp_dir_count == temp_dir_alloc)
 503     temp_dirs = x2nrealloc (temp_dirs, &temp_dir_alloc, sizeof *temp_dirs);
 504
 505   temp_dirs[temp_dir_count++] = dir;
 506 }
 507
 508 /* Remove NAME from the list of temporary files.  */
 509
 510 static void
 511 zaptemp (const char *name)
 512 {
 513   struct tempnode *volatile *pnode;
 514   struct tempnode *node;
 515   struct tempnode *next;
 516   sigset_t oldset;
 517   int unlink_status;
 518   int unlink_errno = 0;
 519
 520   for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
 521     continue;
 522
 523   /* Unlink the temporary file in a critical section to avoid races.  */
 524   next = node->next;
 525   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 526   unlink_status = unlink (name);
 527   unlink_errno = errno;
 528   *pnode = next;
 529   sigprocmask (SIG_SETMASK, &oldset, NULL);
 530
 531   if (unlink_status != 0)
 532     error (0, unlink_errno, _("warning: cannot remove: %s"), name);
 533   if (! next)
 534     temptail = pnode;
 535   free (node);
 536 }
 537
 538 #if HAVE_NL_LANGINFO
 539
 540 static int
 541 struct_month_cmp (const void *m1, const void *m2)
 542 {
 543   struct month const *month1 = m1;
 544   struct month const *month2 = m2;
 545   return strcmp (month1->name, month2->name);
 546 }
 547
 548 #endif
 549
 550 /* Initialize the character class tables. */
 551
 552 static void
 553 inittables (void)
 554 {
 555   size_t i;
 556
 557   for (i = 0; i < UCHAR_LIM; ++i)
 558     {
 559       blanks[i] = !!ISBLANK (i);
 560       nonprinting[i] = !ISPRINT (i);
 561       nondictionary[i] = !ISALNUM (i) && !ISBLANK (i);
 562       fold_toupper[i] = (ISLOWER (i) ? toupper (i) : i);
 563     }
 564
 565 #if HAVE_NL_LANGINFO
 566   /* If we're not in the "C" locale, read different names for months.  */
 567   if (hard_LC_TIME)
 568     {
 569       for (i = 0; i < MONTHS_PER_YEAR; i++)
 570         {
 571           char const *s;
 572           size_t s_len;
 573           size_t j;
 574           char *name;
 575
 576           s = (char *) nl_langinfo (ABMON_1 + i);
 577           s_len = strlen (s);
 578           monthtab[i].name = name = xmalloc (s_len + 1);
 579           monthtab[i].val = i + 1;
 580
 581           for (j = 0; j < s_len; j++)
 582             name[j] = fold_toupper[to_uchar (s[j])];
 583           name[j] = '\0';
 584         }
 585       qsort ((void *) monthtab, MONTHS_PER_YEAR,
 586              sizeof *monthtab, struct_month_cmp);
 587     }
 588 #endif
 589 }
 590
 591 /* Specify the amount of main memory to use when sorting.  */
 592 static void
 593 specify_sort_size (char const *s)
 594 {
 595   uintmax_t n;
 596   char *suffix;
 597   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
 598
 599   /* The default unit is KiB.  */
 600   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
 601     {
 602       if (n <= UINTMAX_MAX / 1024)
 603         n *= 1024;
 604       else
 605         e = LONGINT_OVERFLOW;
 606     }
 607
 608   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
 609   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
 610     switch (suffix[0])
 611       {
 612       case 'b':
 613         e = LONGINT_OK;
 614         break;
 615
 616       case '%':
 617         {
 618           double mem = physmem_total () * n / 100;
 619
 620           /* Use "<", not "<=", to avoid problems with rounding.  */
 621           if (mem < UINTMAX_MAX)
 622             {
 623               n = mem;
 624               e = LONGINT_OK;
 625             }
 626           else
 627             e = LONGINT_OVERFLOW;
 628         }
 629         break;
 630       }
 631
 632   if (e == LONGINT_OK)
 633     {
 634       /* If multiple sort sizes are specified, take the maximum, so
 635          that option order does not matter.  */
 636       if (n < sort_size)
 637         return;
 638
 639       sort_size = n;
 640       if (sort_size == n)
 641         {
 642           sort_size = MAX (sort_size, MIN_SORT_SIZE);
 643           return;
 644         }
 645
 646       e = LONGINT_OVERFLOW;
 647     }
 648
 649   STRTOL_FATAL_ERROR (s, _("sort size"), e);
 650 }
 651
 652 /* Return the default sort size.  */
 653 static size_t
 654 default_sort_size (void)
 655 {
 656   /* Let MEM be available memory or 1/8 of total memory, whichever
 657      is greater.  */
 658   double avail = physmem_available ();
 659   double total = physmem_total ();
 660   double mem = MAX (avail, total / 8);
 661   struct rlimit rlimit;
 662
 663   /* Let SIZE be MEM, but no more than the maximum object size or
 664      system resource limits.  Avoid the MIN macro here, as it is not
 665      quite right when only one argument is floating point.  Don't
 666      bother to check for values like RLIM_INFINITY since in practice
 667      they are not much less than SIZE_MAX.  */
 668   size_t size = SIZE_MAX;
 669   if (mem < size)
 670     size = mem;
 671   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
 672     size = rlimit.rlim_cur;
 673 #ifdef RLIMIT_AS
 674   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
 675     size = rlimit.rlim_cur;
 676 #endif
 677
 678   /* Leave a large safety margin for the above limits, as failure can
 679      occur when they are exceeded.  */
 680   size /= 2;
 681
 682 #ifdef RLIMIT_RSS
 683   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
 684      Exceeding RSS is not fatal, but can be quite slow.  */
 685   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
 686     size = rlimit.rlim_cur / 16 * 15;
 687 #endif
 688
 689   /* Use no less than the minimum.  */
 690   return MAX (size, MIN_SORT_SIZE);
 691 }
 692
 693 /* Return the sort buffer size to use with the input files identified
 694    by FPS and FILES, which are alternate paths to the same files.
 695    NFILES gives the number of input files; NFPS may be less.  Assume
 696    that each input line requires LINE_BYTES extra bytes' worth of line
 697    information.  Do not exceed a bound on the size: if the bound is
 698    not specified by the user, use a default.  */
 699
 700 static size_t
 701 sort_buffer_size (FILE *const *fps, size_t nfps,
 702                   char *const *files, size_t nfiles,
 703                   size_t line_bytes)
 704 {
 705   /* A bound on the input size.  If zero, the bound hasn't been
 706      determined yet.  */
 707   static size_t size_bound;
 708
 709   /* In the worst case, each input byte is a newline.  */
 710   size_t worst_case_per_input_byte = line_bytes + 1;
 711
 712   /* Keep enough room for one extra input line and an extra byte.
 713      This extra room might be needed when preparing to read EOF.  */
 714   size_t size = worst_case_per_input_byte + 1;
 715
 716   size_t i;
 717
 718   for (i = 0; i < nfiles; i++)
 719     {
 720       struct stat st;
 721       off_t file_size;
 722       size_t worst_case;
 723
 724       if ((i < nfps ? fstat (fileno (fps[i]), &st)
 725            : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
 726            : stat (files[i], &st))
 727           != 0)
 728         die (_("stat failed"), files[i]);
 729
 730       if (S_ISREG (st.st_mode))
 731         file_size = st.st_size;
 732       else
 733         {
 734           /* The file has unknown size.  If the user specified a sort
 735              buffer size, use that; otherwise, guess the size.  */
 736           if (sort_size)
 737             return sort_size;
 738           file_size = INPUT_FILE_SIZE_GUESS;
 739         }
 740
 741       if (! size_bound)
 742         {
 743           size_bound = sort_size;
 744           if (! size_bound)
 745             size_bound = default_sort_size ();
 746         }
 747
 748       /* Add the amount of memory needed to represent the worst case
 749          where the input consists entirely of newlines followed by a
 750          single non-newline.  Check for overflow.  */
 751       worst_case = file_size * worst_case_per_input_byte + 1;
 752       if (file_size != worst_case / worst_case_per_input_byte
 753           || size_bound - size <= worst_case)
 754         return size_bound;
 755       size += worst_case;
 756     }
 757
 758   return size;
 759 }
 760
 761 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
 762    must be at least sizeof (struct line).  Allocate ALLOC bytes
 763    initially.  */
 764
 765 static void
 766 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
 767 {
 768   /* Ensure that the line array is properly aligned.  If the desired
 769      size cannot be allocated, repeatedly halve it until allocation
 770      succeeds.  The smaller allocation may hurt overall performance,
 771      but that's better than failing.  */
 772   for (;;)
 773     {
 774       alloc += sizeof (struct line) - alloc % sizeof (struct line);
 775       buf->buf = malloc (alloc);
 776       if (buf->buf)
 777         break;
 778       alloc /= 2;
 779       if (alloc <= line_bytes + 1)
 780         xalloc_die ();
 781     }
 782
 783   buf->line_bytes = line_bytes;
 784   buf->alloc = alloc;
 785   buf->used = buf->left = buf->nlines = 0;
 786   buf->eof = false;
 787 }
 788
 789 /* Return one past the limit of the line array.  */
 790
 791 static inline struct line *
 792 buffer_linelim (struct buffer const *buf)
 793 {
 794   return (struct line *) (buf->buf + buf->alloc);
 795 }
 796
 797 /* Return a pointer to the first character of the field specified
 798    by KEY in LINE. */
 799
 800 static char *
 801 begfield (const struct line *line, const struct keyfield *key)
 802 {
 803   register char *ptr = line->text, *lim = ptr + line->length - 1;
 804   register size_t sword = key->sword;
 805   register size_t schar = key->schar;
 806   register size_t remaining_bytes;
 807
 808   /* The leading field separator itself is included in a field when -t
 809      is absent.  */
 810
 811   if (tab != TAB_DEFAULT)
 812     while (ptr < lim && sword--)
 813       {
 814         while (ptr < lim && *ptr != tab)
 815           ++ptr;
 816         if (ptr < lim)
 817           ++ptr;
 818       }
 819   else
 820     while (ptr < lim && sword--)
 821       {
 822         while (ptr < lim && blanks[to_uchar (*ptr)])
 823           ++ptr;
 824         while (ptr < lim && !blanks[to_uchar (*ptr)])
 825           ++ptr;
 826       }
 827
 828   if (key->skipsblanks)
 829     while (ptr < lim && blanks[to_uchar (*ptr)])
 830       ++ptr;
 831
 832   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
 833   remaining_bytes = lim - ptr;
 834   if (schar < remaining_bytes)
 835     ptr += schar;
 836   else
 837     ptr = lim;
 838
 839   return ptr;
 840 }
 841
 842 /* Return the limit of (a pointer to the first character after) the field
 843    in LINE specified by KEY. */
 844
 845 static char *
 846 limfield (const struct line *line, const struct keyfield *key)
 847 {
 848   register char *ptr = line->text, *lim = ptr + line->length - 1;
 849   register size_t eword = key->eword, echar = key->echar;
 850   register size_t remaining_bytes;
 851
 852   /* Move PTR past EWORD fields or to one past the last byte on LINE,
 853      whichever comes first.  If there are more than EWORD fields, leave
 854      PTR pointing at the beginning of the field having zero-based index,
 855      EWORD.  If a delimiter character was specified (via -t), then that
 856      `beginning' is the first character following the delimiting TAB.
 857      Otherwise, leave PTR pointing at the first `blank' character after
 858      the preceding field.  */
 859   if (tab != TAB_DEFAULT)
 860     while (ptr < lim && eword--)
 861       {
 862         while (ptr < lim && *ptr != tab)
 863           ++ptr;
 864         if (ptr < lim && (eword | echar))
 865           ++ptr;
 866       }
 867   else
 868     while (ptr < lim && eword--)
 869       {
 870         while (ptr < lim && blanks[to_uchar (*ptr)])
 871           ++ptr;
 872         while (ptr < lim && !blanks[to_uchar (*ptr)])
 873           ++ptr;
 874       }
 875
 876 #ifdef POSIX_UNSPECIFIED
 877   /* The following block of code makes GNU sort incompatible with
 878      standard Unix sort, so it's ifdef'd out for now.
 879      The POSIX spec isn't clear on how to interpret this.
 880      FIXME: request clarification.
 881
 882      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
 883      Date: Thu, 30 May 96 12:20:41 -0400
 884      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
 885
 886      [...]I believe I've found another bug in `sort'.
 887
 888      $ cat /tmp/sort.in
 889      a b c 2 d
 890      pq rs 1 t
 891      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
 892      a b c 2 d
 893      pq rs 1 t
 894      $ /bin/sort -k1.7,1.7 </tmp/sort.in
 895      pq rs 1 t
 896      a b c 2 d
 897
 898      Unix sort produced the answer I expected: sort on the single character
 899      in column 7.  GNU sort produced different results, because it disagrees
 900      on the interpretation of the key-end spec "M.N".  Unix sort reads this
 901      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
 902      "skip M-1 fields, then either N-1 characters or the rest of the current
 903      field, whichever comes first".  This extra clause applies only to
 904      key-ends, not key-starts.
 905      */
 906
 907   /* Make LIM point to the end of (one byte past) the current field.  */
 908   if (tab != TAB_DEFAULT)
 909     {
 910       char *newlim;
 911       newlim = memchr (ptr, tab, lim - ptr);
 912       if (newlim)
 913         lim = newlim;
 914     }
 915   else
 916     {
 917       char *newlim;
 918       newlim = ptr;
 919       while (newlim < lim && blanks[to_uchar (*newlim)])
 920         ++newlim;
 921       while (newlim < lim && !blanks[to_uchar (*newlim)])
 922         ++newlim;
 923       lim = newlim;
 924     }
 925 #endif
 926
 927   /* If we're ignoring leading blanks when computing the End
 928      of the field, don't start counting bytes until after skipping
 929      past any leading blanks. */
 930   if (key->skipeblanks)
 931     while (ptr < lim && blanks[to_uchar (*ptr)])
 932       ++ptr;
 933
 934   /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
 935   remaining_bytes = lim - ptr;
 936   if (echar < remaining_bytes)
 937     ptr += echar;
 938   else
 939     ptr = lim;
 940
 941   return ptr;
 942 }
 943
 944 /* Fill BUF reading from FP, moving buf->left bytes from the end
 945    of buf->buf to the beginning first.  If EOF is reached and the
 946    file wasn't terminated by a newline, supply one.  Set up BUF's line
 947    table too.  FILE is the name of the file corresponding to FP.
 948    Return true if some input was read.  */
 949
 950 static bool
 951 fillbuf (struct buffer *buf, register FILE *fp, char const *file)
 952 {
 953   struct keyfield const *key = keylist;
 954   char eol = eolchar;
 955   size_t line_bytes = buf->line_bytes;
 956   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
 957
 958   if (buf->eof)
 959     return false;
 960
 961   if (buf->used != buf->left)
 962     {
 963       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
 964       buf->used = buf->left;
 965       buf->nlines = 0;
 966     }
 967
 968   for (;;)
 969     {
 970       char *ptr = buf->buf + buf->used;
 971       struct line *linelim = buffer_linelim (buf);
 972       struct line *line = linelim - buf->nlines;
 973       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
 974       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
 975
 976       while (line_bytes + 1 < avail)
 977         {
 978           /* Read as many bytes as possible, but do not read so many
 979              bytes that there might not be enough room for the
 980              corresponding line array.  The worst case is when the
 981              rest of the input file consists entirely of newlines,
 982              except that the last byte is not a newline.  */
 983           size_t readsize = (avail - 1) / (line_bytes + 1);
 984           size_t bytes_read = fread (ptr, 1, readsize, fp);
 985           char *ptrlim = ptr + bytes_read;
 986           char *p;
 987           avail -= bytes_read;
 988
 989           if (bytes_read != readsize)
 990             {
 991               if (ferror (fp))
 992                 die (_("read failed"), file);
 993               if (feof (fp))
 994                 {
 995                   buf->eof = true;
 996                   if (buf->buf == ptrlim)
 997                     return false;
 998                   if (ptrlim[-1] != eol)
 999                     *ptrlim++ = eol;
1000                 }
1001             }
1002
1003           /* Find and record each line in the just-read input.  */
1004           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1005             {
1006               ptr = p + 1;
1007               line--;
1008               line->text = line_start;
1009               line->length = ptr - line_start;
1010               mergesize = MAX (mergesize, line->length);
1011               avail -= line_bytes;
1012
1013               if (key)
1014                 {
1015                   /* Precompute the position of the first key for
1016                      efficiency.  */
1017                   line->keylim = (key->eword == SIZE_MAX
1018                                   ? p
1019                                   : limfield (line, key));
1020
1021                   if (key->sword != SIZE_MAX)
1022                     line->keybeg = begfield (line, key);
1023                   else
1024                     {
1025                       if (key->skipsblanks)
1026                         while (blanks[to_uchar (*line_start)])
1027                           line_start++;
1028                       line->keybeg = line_start;
1029                     }
1030                 }
1031
1032               line_start = ptr;
1033             }
1034
1035           ptr = ptrlim;
1036           if (buf->eof)
1037             break;
1038         }
1039
1040       buf->used = ptr - buf->buf;
1041       buf->nlines = buffer_linelim (buf) - line;
1042       if (buf->nlines != 0)
1043         {
1044           buf->left = ptr - line_start;
1045           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1046           return true;
1047         }
1048
1049       /* The current input line is too long to fit in the buffer.
1050          Double the buffer size and try again.  */
1051       buf->buf = x2nrealloc (buf->buf, &buf->alloc, sizeof *(buf->buf));
1052     }
1053 }
1054
1055 /* Compare strings A and B containing decimal fractions < 1.  Each string
1056    should begin with a decimal point followed immediately by the digits
1057    of the fraction.  Strings not of this form are considered to be zero. */
1058
1059 /* The goal here, is to take two numbers a and b... compare these
1060    in parallel.  Instead of converting each, and then comparing the
1061    outcome.  Most likely stopping the comparison before the conversion
1062    is complete.  The algorithm used, in the old sort:
1063
1064    Algorithm: fraccompare
1065    Action   : compare two decimal fractions
1066    accepts  : char *a, char *b
1067    returns  : -1 if a<b, 0 if a=b, 1 if a>b.
1068    implement:
1069
1070    if *a == decimal_point AND *b == decimal_point
1071      find first character different in a and b.
1072      if both are digits, return the difference *a - *b.
1073      if *a is a digit
1074        skip past zeros
1075        if digit return 1, else 0
1076      if *b is a digit
1077        skip past zeros
1078        if digit return -1, else 0
1079    if *a is a decimal_point
1080      skip past decimal_point and zeros
1081      if digit return 1, else 0
1082    if *b is a decimal_point
1083      skip past decimal_point and zeros
1084      if digit return -1, else 0
1085    return 0 */
1086
1087 static int
1088 fraccompare (register const char *a, register const char *b)
1089 {
1090   if (*a == decimal_point && *b == decimal_point)
1091     {
1092       while (*++a == *++b)
1093         if (! ISDIGIT (*a))
1094           return 0;
1095       if (ISDIGIT (*a) && ISDIGIT (*b))
1096         return *a - *b;
1097       if (ISDIGIT (*a))
1098         goto a_trailing_nonzero;
1099       if (ISDIGIT (*b))
1100         goto b_trailing_nonzero;
1101       return 0;
1102     }
1103   else if (*a++ == decimal_point)
1104     {
1105     a_trailing_nonzero:
1106       while (*a == NUMERIC_ZERO)
1107         a++;
1108       return ISDIGIT (*a);
1109     }
1110   else if (*b++ == decimal_point)
1111     {
1112     b_trailing_nonzero:
1113       while (*b == NUMERIC_ZERO)
1114         b++;
1115       return - ISDIGIT (*b);
1116     }
1117   return 0;
1118 }
1119
1120 /* Compare strings A and B as numbers without explicitly converting them to
1121    machine numbers.  Comparatively slow for short strings, but asymptotically
1122    hideously fast. */
1123
1124 static int
1125 numcompare (register const char *a, register const char *b)
1126 {
1127   char tmpa;
1128   char tmpb;
1129   int tmp;
1130   size_t log_a;
1131   size_t log_b;
1132
1133   while (blanks[to_uchar (tmpa = *a)])
1134     a++;
1135   while (blanks[to_uchar (tmpb = *b)])
1136     b++;
1137
1138   if (tmpa == NEGATION_SIGN)
1139     {
1140       do
1141         tmpa = *++a;
1142       while (tmpa == NUMERIC_ZERO || tmpa == thousands_sep);
1143       if (tmpb != NEGATION_SIGN)
1144         {
1145           if (tmpa == decimal_point)
1146             do
1147               tmpa = *++a;
1148             while (tmpa == NUMERIC_ZERO);
1149           if (ISDIGIT (tmpa))
1150             return -1;
1151           while (tmpb == NUMERIC_ZERO || tmpb == thousands_sep)
1152             tmpb = *++b;
1153           if (tmpb == decimal_point)
1154             do
1155               tmpb = *++b;
1156             while (tmpb == NUMERIC_ZERO);
1157           return - ISDIGIT (tmpb);
1158         }
1159       do
1160         tmpb = *++b;
1161       while (tmpb == NUMERIC_ZERO || tmpb == thousands_sep);
1162
1163       while (tmpa == tmpb && ISDIGIT (tmpa))
1164         {
1165           do
1166             tmpa = *++a;
1167           while (tmpa == thousands_sep);
1168           do
1169             tmpb = *++b;
1170           while (tmpb == thousands_sep);
1171         }
1172
1173       if ((tmpa == decimal_point && !ISDIGIT (tmpb))
1174           || (tmpb == decimal_point && !ISDIGIT (tmpa)))
1175         return fraccompare (b, a);
1176
1177       tmp = tmpb - tmpa;
1178
1179       for (log_a = 0; ISDIGIT (tmpa); ++log_a)
1180         do
1181           tmpa = *++a;
1182         while (tmpa == thousands_sep);
1183
1184       for (log_b = 0; ISDIGIT (tmpb); ++log_b)
1185         do
1186           tmpb = *++b;
1187         while (tmpb == thousands_sep);
1188
1189       if (log_a != log_b)
1190         return log_a < log_b ? 1 : -1;
1191
1192       if (!log_a)
1193         return 0;
1194
1195       return tmp;
1196     }
1197   else if (tmpb == NEGATION_SIGN)
1198     {
1199       do
1200         tmpb = *++b;
1201       while (tmpb == NUMERIC_ZERO || tmpb == thousands_sep);
1202       if (tmpb == decimal_point)
1203         do
1204           tmpb = *++b;
1205         while (tmpb == NUMERIC_ZERO);
1206       if (ISDIGIT (tmpb))
1207         return 1;
1208       while (tmpa == NUMERIC_ZERO || tmpa == thousands_sep)
1209         tmpa = *++a;
1210       if (tmpa == decimal_point)
1211         do
1212           tmpa = *++a;
1213         while (tmpa == NUMERIC_ZERO);
1214       return ISDIGIT (tmpa);
1215     }
1216   else
1217     {
1218       while (tmpa == NUMERIC_ZERO || tmpa == thousands_sep)
1219         tmpa = *++a;
1220       while (tmpb == NUMERIC_ZERO || tmpb == thousands_sep)
1221         tmpb = *++b;
1222
1223       while (tmpa == tmpb && ISDIGIT (tmpa))
1224         {
1225           do
1226             tmpa = *++a;
1227           while (tmpa == thousands_sep);
1228           do
1229             tmpb = *++b;
1230           while (tmpb == thousands_sep);
1231         }
1232
1233       if ((tmpa == decimal_point && !ISDIGIT (tmpb))
1234           || (tmpb == decimal_point && !ISDIGIT (tmpa)))
1235         return fraccompare (a, b);
1236
1237       tmp = tmpa - tmpb;
1238
1239       for (log_a = 0; ISDIGIT (tmpa); ++log_a)
1240         do
1241           tmpa = *++a;
1242         while (tmpa == thousands_sep);
1243
1244       for (log_b = 0; ISDIGIT (tmpb); ++log_b)
1245         do
1246           tmpb = *++b;
1247         while (tmpb == thousands_sep);
1248
1249       if (log_a != log_b)
1250         return log_a < log_b ? -1 : 1;
1251
1252       if (!log_a)
1253         return 0;
1254
1255       return tmp;
1256     }
1257 }
1258
1259 static int
1260 general_numcompare (const char *sa, const char *sb)
1261 {
1262   /* FIXME: add option to warn about failed conversions.  */
1263   /* FIXME: maybe add option to try expensive FP conversion
1264      only if A and B can't be compared more cheaply/accurately.  */
1265
1266   char *ea;
1267   char *eb;
1268   double a = strtod (sa, &ea);
1269   double b = strtod (sb, &eb);
1270
1271   /* Put conversion errors at the start of the collating sequence.  */
1272   if (sa == ea)
1273     return sb == eb ? 0 : -1;
1274   if (sb == eb)
1275     return 1;
1276
1277   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1278      conversion errors but before numbers; sort them by internal
1279      bit-pattern, for lack of a more portable alternative.  */
1280   return (a < b ? -1
1281           : a > b ? 1
1282           : a == b ? 0
1283           : b == b ? -1
1284           : a == a ? 1
1285           : memcmp ((char *) &a, (char *) &b, sizeof a));
1286 }
1287
1288 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1289    Return 0 if the name in S is not recognized.  */
1290
1291 static int
1292 getmonth (char const *month, size_t len)
1293 {
1294   size_t lo = 0;
1295   size_t hi = MONTHS_PER_YEAR;
1296   char const *monthlim = month + len;
1297
1298   for (;;)
1299     {
1300       if (month == monthlim)
1301         return 0;
1302       if (!blanks[to_uchar (*month)])
1303         break;
1304       ++month;
1305     }
1306
1307   do
1308     {
1309       size_t ix = (lo + hi) / 2;
1310       char const *m = month;
1311       char const *n = monthtab[ix].name;
1312
1313       for (;; m++, n++)
1314         {
1315           if (!*n)
1316             return monthtab[ix].val;
1317           if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1318             {
1319               hi = ix;
1320               break;
1321             }
1322           else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1323             {
1324               lo = ix + 1;
1325               break;
1326             }
1327         }
1328     }
1329   while (lo < hi);
1330
1331   return 0;
1332 }
1333
1334 /* Compare two lines A and B trying every key in sequence until there
1335    are no more keys or a difference is found. */
1336
1337 static int
1338 keycompare (const struct line *a, const struct line *b)
1339 {
1340   struct keyfield const *key = keylist;
1341
1342   /* For the first iteration only, the key positions have been
1343      precomputed for us. */
1344   register char *texta = a->keybeg;
1345   register char *textb = b->keybeg;
1346   register char *lima = a->keylim;
1347   register char *limb = b->keylim;
1348
1349   int diff;
1350
1351   for (;;)
1352     {
1353       register char const *translate = key->translate;
1354       register bool const *ignore = key->ignore;
1355
1356       /* Find the lengths. */
1357       size_t lena = lima <= texta ? 0 : lima - texta;
1358       size_t lenb = limb <= textb ? 0 : limb - textb;
1359
1360       /* Actually compare the fields. */
1361       if (key->numeric | key->general_numeric)
1362         {
1363           char savea = *lima, saveb = *limb;
1364
1365           *lima = *limb = '\0';
1366           diff = ((key->numeric ? numcompare : general_numcompare)
1367                   (texta, textb));
1368           *lima = savea, *limb = saveb;
1369         }
1370       else if (key->month)
1371         diff = getmonth (texta, lena) - getmonth (textb, lenb);
1372       /* Sorting like this may become slow, so in a simple locale the user
1373          can select a faster sort that is similar to ascii sort.  */
1374       else if (hard_LC_COLLATE)
1375         {
1376           if (ignore || translate)
1377             {
1378               char buf[4000];
1379               size_t size = lena + 1 + lenb + 1;
1380               char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1381               char *copy_b = copy_a + lena + 1;
1382               size_t new_len_a, new_len_b, i;
1383
1384               /* Ignore and/or translate chars before comparing.  */
1385               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1386                 {
1387                   if (i < lena)
1388                     {
1389                       copy_a[new_len_a] = (translate
1390                                            ? translate[to_uchar (texta[i])]
1391                                            : texta[i]);
1392                       if (!ignore || !ignore[to_uchar (texta[i])])
1393                         ++new_len_a;
1394                     }
1395                   if (i < lenb)
1396                     {
1397                       copy_b[new_len_b] = (translate
1398                                            ? translate[to_uchar (textb[i])]
1399                                            : textb [i]);
1400                       if (!ignore || !ignore[to_uchar (textb[i])])
1401                         ++new_len_b;
1402                     }
1403                 }
1404
1405               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1406
1407               if (sizeof buf < size)
1408                 free (copy_a);
1409             }
1410           else if (lena == 0)
1411             diff = - NONZERO (lenb);
1412           else if (lenb == 0)
1413             goto greater;
1414           else
1415             diff = xmemcoll (texta, lena, textb, lenb);
1416         }
1417       else if (ignore)
1418         {
1419 #define CMP_WITH_IGNORE(A, B)                                           \
1420   do                                                                    \
1421     {                                                                   \
1422           for (;;)                                                      \
1423             {                                                           \
1424               while (texta < lima && ignore[to_uchar (*texta)])         \
1425                 ++texta;                                                \
1426               while (textb < limb && ignore[to_uchar (*textb)])         \
1427                 ++textb;                                                \
1428               if (! (texta < lima && textb < limb))                     \
1429                 break;                                                  \
1430               diff = to_uchar (A) - to_uchar (B);                       \
1431               if (diff)                                                 \
1432                 goto not_equal;                                         \
1433               ++texta;                                                  \
1434               ++textb;                                                  \
1435             }                                                           \
1436                                                                         \
1437           diff = (texta < lima) - (textb < limb);                       \
1438     }                                                                   \
1439   while (0)
1440
1441           if (translate)
1442             CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1443                              translate[to_uchar (*textb)]);
1444           else
1445             CMP_WITH_IGNORE (*texta, *textb);
1446         }
1447       else if (lena == 0)
1448         diff = - NONZERO (lenb);
1449       else if (lenb == 0)
1450         goto greater;
1451       else
1452         {
1453           if (translate)
1454             {
1455               while (texta < lima && textb < limb)
1456                 {
1457                   diff = (to_uchar (translate[to_uchar (*texta++)])
1458                           - to_uchar (translate[to_uchar (*textb++)]));
1459                   if (diff)
1460                     goto not_equal;
1461                 }
1462             }
1463           else
1464             {
1465               diff = memcmp (texta, textb, MIN (lena, lenb));
1466               if (diff)
1467                 goto not_equal;
1468             }
1469           diff = lena < lenb ? -1 : lena != lenb;
1470         }
1471
1472       if (diff)
1473         goto not_equal;
1474
1475       key = key->next;
1476       if (! key)
1477         break;
1478
1479       /* Find the beginning and limit of the next field.  */
1480       if (key->eword != SIZE_MAX)
1481         lima = limfield (a, key), limb = limfield (b, key);
1482       else
1483         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1484
1485       if (key->sword != SIZE_MAX)
1486         texta = begfield (a, key), textb = begfield (b, key);
1487       else
1488         {
1489           texta = a->text, textb = b->text;
1490           if (key->skipsblanks)
1491             {
1492               while (texta < lima && blanks[to_uchar (*texta)])
1493                 ++texta;
1494               while (textb < limb && blanks[to_uchar (*textb)])
1495                 ++textb;
1496             }
1497         }
1498     }
1499
1500   return 0;
1501
1502  greater:
1503   diff = 1;
1504  not_equal:
1505   return key->reverse ? -diff : diff;
1506 }
1507
1508 /* Compare two lines A and B, returning negative, zero, or positive
1509    depending on whether A compares less than, equal to, or greater than B. */
1510
1511 static int
1512 compare (register const struct line *a, register const struct line *b)
1513 {
1514   int diff;
1515   size_t alen, blen;
1516
1517   /* First try to compare on the specified keys (if any).
1518      The only two cases with no key at all are unadorned sort,
1519      and unadorned sort -r. */
1520   if (keylist)
1521     {
1522       diff = keycompare (a, b);
1523       if (diff | unique | stable)
1524         return diff;
1525     }
1526
1527   /* If the keys all compare equal (or no keys were specified)
1528      fall through to the default comparison.  */
1529   alen = a->length - 1, blen = b->length - 1;
1530
1531   if (alen == 0)
1532     diff = - NONZERO (blen);
1533   else if (blen == 0)
1534     diff = 1;
1535   else if (hard_LC_COLLATE)
1536     diff = xmemcoll (a->text, alen, b->text, blen);
1537   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1538     diff = alen < blen ? -1 : alen != blen;
1539
1540   return reverse ? -diff : diff;
1541 }
1542
1543 /* Check that the lines read from FILE_NAME come in order.  Print a
1544    diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1545    false if they are not in order.  Otherwise, print no diagnostic
1546    and return true.  */
1547
1548 static bool
1549 check (char const *file_name)
1550 {
1551   FILE *fp = xfopen (file_name, "r");
1552   struct buffer buf;            /* Input buffer. */
1553   struct line temp;             /* Copy of previous line. */
1554   size_t alloc = 0;
1555   uintmax_t line_number = 0;
1556   struct keyfield const *key = keylist;
1557   bool nonunique = ! unique;
1558   bool ordered = true;
1559
1560   initbuf (&buf, sizeof (struct line),
1561            MAX (merge_buffer_size, sort_size));
1562   temp.text = NULL;
1563
1564   while (fillbuf (&buf, fp, file_name))
1565     {
1566       struct line const *line = buffer_linelim (&buf);
1567       struct line const *linebase = line - buf.nlines;
1568
1569       /* Make sure the line saved from the old buffer contents is
1570          less than or equal to the first line of the new buffer. */
1571       if (alloc && nonunique <= compare (&temp, line - 1))
1572         {
1573         found_disorder:
1574           {
1575             struct line const *disorder_line = line - 1;
1576             uintmax_t disorder_line_number =
1577               buffer_linelim (&buf) - disorder_line + line_number;
1578             char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1579             fprintf (stderr, _("%s: %s:%s: disorder: "),
1580                      program_name, file_name,
1581                      umaxtostr (disorder_line_number, hr_buf));
1582             write_bytes (disorder_line->text, disorder_line->length, stderr,
1583                          _("standard error"));
1584             ordered = false;
1585             break;
1586           }
1587         }
1588
1589       /* Compare each line in the buffer with its successor.  */
1590       while (linebase < --line)
1591         if (nonunique <= compare (line, line - 1))
1592           goto found_disorder;
1593
1594       line_number += buf.nlines;
1595
1596       /* Save the last line of the buffer.  */
1597       if (alloc < line->length)
1598         {
1599           do
1600             {
1601               alloc *= 2;
1602               if (! alloc)
1603                 {
1604                   alloc = line->length;
1605                   break;
1606                 }
1607             }
1608           while (alloc < line->length);
1609
1610           temp.text = xrealloc (temp.text, alloc);
1611         }
1612       memcpy (temp.text, line->text, line->length);
1613       temp.length = line->length;
1614       if (key)
1615         {
1616           temp.keybeg = temp.text + (line->keybeg - line->text);
1617           temp.keylim = temp.text + (line->keylim - line->text);
1618         }
1619     }
1620
1621   xfclose (fp, file_name);
1622   free (buf.buf);
1623   if (temp.text)
1624     free (temp.text);
1625   return ordered;
1626 }
1627
1628 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
1629    files (all of which are at the start of the FILES array), and
1630    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
1631    Close input and output files before returning.
1632    OUTPUT_FILE gives the name of the output file.  If it is NULL,
1633    the output file is standard output.  If OFP is NULL, the output
1634    file has not been opened yet (or written to, if standard output).  */
1635
1636 static void
1637 mergefps (char **files, size_t ntemps, size_t nfiles,
1638           FILE *ofp, char const *output_file)
1639 {
1640   FILE *fps[NMERGE];            /* Input streams for each file.  */
1641   struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1642   struct line saved;            /* Saved line storage for unique check. */
1643   struct line const *savedline = NULL;
1644                                 /* &saved if there is a saved line. */
1645   size_t savealloc = 0;         /* Size allocated for the saved line. */
1646   struct line const *cur[NMERGE]; /* Current line in each line table. */
1647   struct line const *base[NMERGE]; /* Base of each line table.  */
1648   size_t ord[NMERGE];           /* Table representing a permutation of fps,
1649                                    such that cur[ord[0]] is the smallest line
1650                                    and will be next output. */
1651   size_t i;
1652   size_t j;
1653   size_t t;
1654   struct keyfield const *key = keylist;
1655   saved.text = NULL;
1656
1657   /* Read initial lines from each input file. */
1658   for (i = 0; i < nfiles; )
1659     {
1660       fps[i] = xfopen (files[i], "r");
1661       initbuf (&buffer[i], sizeof (struct line),
1662                MAX (merge_buffer_size, sort_size / nfiles));
1663       if (fillbuf (&buffer[i], fps[i], files[i]))
1664         {
1665           struct line const *linelim = buffer_linelim (&buffer[i]);
1666           cur[i] = linelim - 1;
1667           base[i] = linelim - buffer[i].nlines;
1668           i++;
1669         }
1670       else
1671         {
1672           /* fps[i] is empty; eliminate it from future consideration.  */
1673           xfclose (fps[i], files[i]);
1674           if (i < ntemps)
1675             {
1676               ntemps--;
1677               zaptemp (files[i]);
1678             }
1679           free (buffer[i].buf);
1680           --nfiles;
1681           for (j = i; j < nfiles; ++j)
1682             files[j] = files[j + 1];
1683         }
1684     }
1685
1686   if (! ofp)
1687     ofp = xfopen (output_file, "w");
1688
1689   /* Set up the ord table according to comparisons among input lines.
1690      Since this only reorders two items if one is strictly greater than
1691      the other, it is stable. */
1692   for (i = 0; i < nfiles; ++i)
1693     ord[i] = i;
1694   for (i = 1; i < nfiles; ++i)
1695     if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
1696       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1697
1698   /* Repeatedly output the smallest line until no input remains. */
1699   while (nfiles)
1700     {
1701       struct line const *smallest = cur[ord[0]];
1702
1703       /* If uniquified output is turned on, output only the first of
1704          an identical series of lines. */
1705       if (unique)
1706         {
1707           if (savedline && compare (savedline, smallest))
1708             {
1709               savedline = 0;
1710               write_bytes (saved.text, saved.length, ofp, output_file);
1711             }
1712           if (!savedline)
1713             {
1714               savedline = &saved;
1715               if (savealloc < smallest->length)
1716                 {
1717                   do
1718                     if (! savealloc)
1719                       {
1720                         savealloc = smallest->length;
1721                         break;
1722                       }
1723                   while ((savealloc *= 2) < smallest->length);
1724
1725                   saved.text = xrealloc (saved.text, savealloc);
1726                 }
1727               saved.length = smallest->length;
1728               memcpy (saved.text, smallest->text, saved.length);
1729               if (key)
1730                 {
1731                   saved.keybeg =
1732                     saved.text + (smallest->keybeg - smallest->text);
1733                   saved.keylim =
1734                     saved.text + (smallest->keylim - smallest->text);
1735                 }
1736             }
1737         }
1738       else
1739         write_bytes (smallest->text, smallest->length, ofp, output_file);
1740
1741       /* Check if we need to read more lines into core. */
1742       if (base[ord[0]] < smallest)
1743         cur[ord[0]] = smallest - 1;
1744       else
1745         {
1746           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]]))
1747             {
1748               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
1749               cur[ord[0]] = linelim - 1;
1750               base[ord[0]] = linelim - buffer[ord[0]].nlines;
1751             }
1752           else
1753             {
1754               /* We reached EOF on fps[ord[0]].  */
1755               for (i = 1; i < nfiles; ++i)
1756                 if (ord[i] > ord[0])
1757                   --ord[i];
1758               --nfiles;
1759               xfclose (fps[ord[0]], files[ord[0]]);
1760               if (ord[0] < ntemps)
1761                 {
1762                   ntemps--;
1763                   zaptemp (files[ord[0]]);
1764                 }
1765               free (buffer[ord[0]].buf);
1766               for (i = ord[0]; i < nfiles; ++i)
1767                 {
1768                   fps[i] = fps[i + 1];
1769                   files[i] = files[i + 1];
1770                   buffer[i] = buffer[i + 1];
1771                   cur[i] = cur[i + 1];
1772                   base[i] = base[i + 1];
1773                 }
1774               for (i = 0; i < nfiles; ++i)
1775                 ord[i] = ord[i + 1];
1776               continue;
1777             }
1778         }
1779
1780       /* The new line just read in may be larger than other lines
1781          already in core; push it back in the queue until we encounter
1782          a line larger than it. */
1783       for (i = 1; i < nfiles; ++i)
1784         {
1785           int cmp = compare (cur[ord[0]], cur[ord[i]]);
1786           if (cmp < 0 || (cmp == 0 && ord[0] < ord[i]))
1787             break;
1788         }
1789       t = ord[0];
1790       for (j = 1; j < i; ++j)
1791         ord[j - 1] = ord[j];
1792       ord[i - 1] = t;
1793     }
1794
1795   if (unique && savedline)
1796     {
1797       write_bytes (saved.text, saved.length, ofp, output_file);
1798       free (saved.text);
1799     }
1800
1801   xfclose (ofp, output_file);
1802 }
1803
1804 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1805    and HI (with NHI members).  T, LO, and HI point just past their
1806    respective arrays, and the arrays are in reverse order.  NLO and
1807    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
1808
1809 static inline void
1810 mergelines (struct line *t,
1811             struct line const *lo, size_t nlo,
1812             struct line const *hi, size_t nhi)
1813 {
1814   for (;;)
1815     if (compare (lo - 1, hi - 1) <= 0)
1816       {
1817         *--t = *--lo;
1818         if (! --nlo)
1819           {
1820             /* HI - NHI equalled T - (NLO + NHI) when this function
1821                began.  Therefore HI must equal T now, and there is no
1822                need to copy from HI to T.  */
1823             return;
1824           }
1825       }
1826     else
1827       {
1828         *--t = *--hi;
1829         if (! --nhi)
1830           {
1831             do
1832               *--t = *--lo;
1833             while (--nlo);
1834
1835             return;
1836           }
1837       }
1838 }
1839
1840 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1841    NLINES must be at least 2.
1842    The input and output arrays are in reverse order, and LINES and
1843    TEMP point just past the end of their respective arrays.
1844
1845    Use a recursive divide-and-conquer algorithm, in the style
1846    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
1847    the optimization suggested by exercise 5.2.4-10; this requires room
1848    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
1849    writes that this memory optimization was originally published by
1850    D. A. Bell, Comp J. 1 (1958), 75.  */
1851
1852 static void
1853 sortlines (struct line *lines, size_t nlines, struct line *temp)
1854 {
1855   if (nlines == 2)
1856     {
1857       if (0 < compare (&lines[-1], &lines[-2]))
1858         {
1859           struct line tmp = lines[-1];
1860           lines[-1] = lines[-2];
1861           lines[-2] = tmp;
1862         }
1863     }
1864   else
1865     {
1866       size_t nlo = nlines / 2;
1867       size_t nhi = nlines - nlo;
1868       struct line *lo = lines;
1869       struct line *hi = lines - nlo;
1870       struct line *sorted_lo = temp;
1871
1872       sortlines (hi, nhi, temp);
1873       if (1 < nlo)
1874         sortlines_temp (lo, nlo, sorted_lo);
1875       else
1876         sorted_lo[-1] = lo[-1];
1877
1878       mergelines (lines, sorted_lo, nlo, hi, nhi);
1879     }
1880 }
1881
1882 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1883    rather than sorting in place.  */
1884
1885 static void
1886 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
1887 {
1888   if (nlines == 2)
1889     {
1890       bool swap = (0 < compare (&lines[-1], &lines[-2]));
1891       temp[-1] = lines[-1 - swap];
1892       temp[-2] = lines[-2 + swap];
1893     }
1894   else
1895     {
1896       size_t nlo = nlines / 2;
1897       size_t nhi = nlines - nlo;
1898       struct line *lo = lines;
1899       struct line *hi = lines - nlo;
1900       struct line *sorted_hi = temp - nlo;
1901
1902       sortlines_temp (hi, nhi, sorted_hi);
1903       if (1 < nlo)
1904         sortlines (lo, nlo, temp);
1905
1906       mergelines (temp, lo, nlo, sorted_hi, nhi);
1907     }
1908 }
1909
1910 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
1911    the same as OUTFILE.  If found, merge the found instances (and perhaps
1912    some other files) into a temporary file so that it can in turn be
1913    merged into OUTFILE without destroying OUTFILE before it is completely
1914    read.  Return the new value of NFILES, which differs from the old if
1915    some merging occurred.
1916
1917    This test ensures that an otherwise-erroneous use like
1918    "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
1919    It's not clear that POSIX requires this nicety.
1920    Detect common error cases, but don't try to catch obscure cases like
1921    "cat ... FILE ... | sort -m -o FILE"
1922    where traditional "sort" doesn't copy the input and where
1923    people should know that they're getting into trouble anyway.
1924    Catching these obscure cases would slow down performance in
1925    common cases.  */
1926
1927 static size_t
1928 avoid_trashing_input (char **files, size_t ntemps, size_t nfiles,
1929                       char const *outfile)
1930 {
1931   size_t i;
1932   bool got_outstat = false;
1933   struct stat outstat;
1934
1935   for (i = ntemps; i < nfiles; i++)
1936     {
1937       bool standard_input = STREQ (files[i], "-");
1938       bool same;
1939       struct stat instat;
1940
1941       if (outfile && STREQ (outfile, files[i]) && ! standard_input)
1942         same = true;
1943       else
1944         {
1945           if (! got_outstat)
1946             {
1947               if ((outfile
1948                    ? stat (outfile, &outstat)
1949                    : fstat (STDOUT_FILENO, &outstat))
1950                   != 0)
1951                 break;
1952               got_outstat = true;
1953             }
1954
1955           same = (((standard_input
1956                     ? fstat (STDIN_FILENO, &instat)
1957                     : stat (files[i], &instat))
1958                    == 0)
1959                   && SAME_INODE (instat, outstat));
1960         }
1961
1962       if (same)
1963         {
1964           FILE *tftp;
1965           char *temp = create_temp_file (&tftp);
1966           mergefps (&files[i], 0, nfiles - i, tftp, temp);
1967           files[i] = temp;
1968           return i + 1;
1969         }
1970     }
1971
1972   return nfiles;
1973 }
1974
1975 /* Merge the input FILES.  NTEMPS is the number of files at the
1976    start of FILES that are temporary; it is zero at the top level.
1977    NFILES is the total number of files.  Put the output in
1978    OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
1979
1980 static void
1981 merge (char **files, size_t ntemps, size_t nfiles, char const *output_file)
1982 {
1983   while (NMERGE < nfiles)
1984     {
1985       /* Number of input files processed so far.  */
1986       size_t in;
1987
1988       /* Number of output files generated so far.  */
1989       size_t out;
1990
1991       /* nfiles % NMERGE; this counts input files that are left over
1992          after all full-sized merges have been done.  */
1993       size_t remainder;
1994
1995       /* Number of easily-available slots at the next loop iteration.  */
1996       size_t cheap_slots;
1997
1998       /* Do as many NMERGE-size merges as possible.  */
1999       for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
2000         {
2001           FILE *tfp;
2002           char *temp = create_temp_file (&tfp);
2003           size_t nt = MIN (ntemps, NMERGE);
2004           ntemps -= nt;
2005           mergefps (&files[in], nt, NMERGE, tfp, temp);
2006           files[out] = temp;
2007         }
2008
2009       remainder = nfiles - in;
2010       cheap_slots = NMERGE - out % NMERGE;
2011
2012       if (cheap_slots < remainder)
2013         {
2014           /* So many files remain that they can't all be put into the last
2015              NMERGE-sized output window.  Do one more merge.  Merge as few
2016              files as possible, to avoid needless I/O.  */
2017           size_t nshortmerge = remainder - cheap_slots + 1;
2018           FILE *tfp;
2019           char *temp = create_temp_file (&tfp);
2020           size_t nt = MIN (ntemps, nshortmerge);
2021           ntemps -= nt;
2022           mergefps (&files[in], nt, nshortmerge, tfp, temp);
2023           files[out++] = temp;
2024           in += nshortmerge;
2025         }
2026
2027       /* Put the remaining input files into the last NMERGE-sized output
2028          window, so they will be merged in the next pass.  */
2029       memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2030       ntemps += out;
2031       nfiles -= in - out;
2032     }
2033
2034   nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2035   mergefps (files, ntemps, nfiles, NULL, output_file);
2036 }
2037
2038 /* Sort NFILES FILES onto OUTPUT_FILE. */
2039
2040 static void
2041 sort (char * const *files, size_t nfiles, char const *output_file)
2042 {
2043   struct buffer buf;
2044   size_t ntemps = 0;
2045   bool output_file_created = false;
2046
2047   buf.alloc = 0;
2048
2049   while (nfiles)
2050     {
2051       char const *temp_output;
2052       char const *file = *files;
2053       FILE *fp = xfopen (file, "r");
2054       FILE *tfp;
2055       size_t bytes_per_line = (2 * sizeof (struct line)
2056                                - sizeof (struct line) / 2);
2057
2058       if (! buf.alloc)
2059         initbuf (&buf, bytes_per_line,
2060                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2061       buf.eof = false;
2062       files++;
2063       nfiles--;
2064
2065       while (fillbuf (&buf, fp, file))
2066         {
2067           struct line *line;
2068           struct line *linebase;
2069
2070           if (buf.eof && nfiles
2071               && (bytes_per_line + 1
2072                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2073             {
2074               /* End of file, but there is more input and buffer room.
2075                  Concatenate the next input file; this is faster in
2076                  the usual case.  */
2077               buf.left = buf.used;
2078               break;
2079             }
2080
2081           line = buffer_linelim (&buf);
2082           linebase = line - buf.nlines;
2083           if (1 < buf.nlines)
2084             sortlines (line, buf.nlines, linebase);
2085           if (buf.eof && !nfiles && !ntemps && !buf.left)
2086             {
2087               xfclose (fp, file);
2088               tfp = xfopen (output_file, "w");
2089               temp_output = output_file;
2090               output_file_created = true;
2091             }
2092           else
2093             {
2094               ++ntemps;
2095               temp_output = create_temp_file (&tfp);
2096             }
2097
2098           do
2099             {
2100               line--;
2101               write_bytes (line->text, line->length, tfp, temp_output);
2102               if (unique)
2103                 while (linebase < line && compare (line, line - 1) == 0)
2104                   line--;
2105             }
2106           while (linebase < line);
2107
2108           xfclose (tfp, temp_output);
2109
2110           if (output_file_created)
2111             goto finish;
2112         }
2113       xfclose (fp, file);
2114     }
2115
2116  finish:
2117   free (buf.buf);
2118
2119   if (! output_file_created)
2120     {
2121       size_t i;
2122       struct tempnode *node = temphead;
2123       char **tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2124       for (i = 0; node; i++)
2125         {
2126           tempfiles[i] = node->name;
2127           node = node->next;
2128         }
2129       merge (tempfiles, ntemps, ntemps, output_file);
2130       free (tempfiles);
2131     }
2132 }
2133
2134 /* Insert key KEY at the end of the key list.  */
2135
2136 static void
2137 insertkey (struct keyfield *key)
2138 {
2139   struct keyfield **p;
2140
2141   for (p = &keylist; *p; p = &(*p)->next)
2142     continue;
2143   *p = key;
2144   key->next = NULL;
2145 }
2146
2147 /* Report a bad field specification SPEC, with extra info MSGID.  */
2148
2149 static void badfieldspec (char const *, char const *)
2150      ATTRIBUTE_NORETURN;
2151 static void
2152 badfieldspec (char const *spec, char const *msgid)
2153 {
2154   error (SORT_FAILURE, 0, _("%s: invalid field specification `%s'"),
2155          _(msgid), spec);
2156   abort ();
2157 }
2158
2159 /* Parse the leading integer in STRING and store the resulting value
2160    (which must fit into size_t) into *VAL.  Return the address of the
2161    suffix after the integer.  If MSGID is NULL, return NULL after
2162    failure; otherwise, report MSGID and exit on failure.  */
2163
2164 static char const *
2165 parse_field_count (char const *string, size_t *val, char const *msgid)
2166 {
2167   char *suffix;
2168   uintmax_t n;
2169
2170   switch (xstrtoumax (string, &suffix, 10, &n, ""))
2171     {
2172     case LONGINT_OK:
2173     case LONGINT_INVALID_SUFFIX_CHAR:
2174       *val = n;
2175       if (*val == n)
2176         break;
2177       /* Fall through.  */
2178     case LONGINT_OVERFLOW:
2179     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2180       if (msgid)
2181         error (SORT_FAILURE, 0, _("%s: count `%.*s' too large"),
2182                _(msgid), (int) (suffix - string), string);
2183       return NULL;
2184
2185     case LONGINT_INVALID:
2186       if (msgid)
2187         error (SORT_FAILURE, 0, _("%s: invalid count at start of `%s'"),
2188                _(msgid), string);
2189       return NULL;
2190     }
2191
2192   return suffix;
2193 }
2194
2195 /* Handle interrupts and hangups. */
2196
2197 static void
2198 sighandler (int sig)
2199 {
2200 #ifndef SA_NOCLDSTOP
2201   signal (sig, SIG_IGN);
2202 #endif
2203
2204   cleanup ();
2205
2206   signal (sig, SIG_DFL);
2207   raise (sig);
2208 }
2209
2210 /* Set the ordering options for KEY specified in S.
2211    Return the address of the first character in S that
2212    is not a valid ordering option.
2213    BLANKTYPE is the kind of blanks that 'b' should skip. */
2214
2215 static char *
2216 set_ordering (register const char *s, struct keyfield *key,
2217               enum blanktype blanktype)
2218 {
2219   while (*s)
2220     {
2221       switch (*s)
2222         {
2223         case 'b':
2224           if (blanktype == bl_start || blanktype == bl_both)
2225             key->skipsblanks = true;
2226           if (blanktype == bl_end || blanktype == bl_both)
2227             key->skipeblanks = true;
2228           break;
2229         case 'd':
2230           key->ignore = nondictionary;
2231           break;
2232         case 'f':
2233           key->translate = fold_toupper;
2234           break;
2235         case 'g':
2236           key->general_numeric = true;
2237           break;
2238         case 'i':
2239           /* Option order should not matter, so don't let -i override
2240              -d.  -d implies -i, but -i does not imply -d.  */
2241           if (! key->ignore)
2242             key->ignore = nonprinting;
2243           break;
2244         case 'M':
2245           key->month = true;
2246           break;
2247         case 'n':
2248           key->numeric = true;
2249           break;
2250         case 'r':
2251           key->reverse = true;
2252           break;
2253         default:
2254           return (char *) s;
2255         }
2256       ++s;
2257     }
2258   return (char *) s;
2259 }
2260
2261 static struct keyfield *
2262 new_key (void)
2263 {
2264   struct keyfield *key = xzalloc (sizeof *key);
2265   key->eword = SIZE_MAX;
2266   return key;
2267 }
2268
2269 int
2270 main (int argc, char **argv)
2271 {
2272   struct keyfield *key;
2273   struct keyfield gkey;
2274   char const *s;
2275   int c = 0;
2276   bool checkonly = false;
2277   bool mergeonly = false;
2278   size_t nfiles = 0;
2279   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2280   bool obsolete_usage = (posix2_version () < 200112);
2281   char const *short_options = (obsolete_usage
2282                                ? COMMON_SHORT_OPTIONS "y::"
2283                                : COMMON_SHORT_OPTIONS "y:");
2284   char *minus = "-", **files;
2285   char const *outfile = NULL;
2286
2287   initialize_main (&argc, &argv);
2288   program_name = argv[0];
2289   setlocale (LC_ALL, "");
2290   bindtextdomain (PACKAGE, LOCALEDIR);
2291   textdomain (PACKAGE);
2292
2293   atexit (cleanup);
2294
2295   initialize_exit_failure (SORT_FAILURE);
2296   atexit (close_stdout);
2297
2298   hard_LC_COLLATE = hard_locale (LC_COLLATE);
2299 #if HAVE_NL_LANGINFO
2300   hard_LC_TIME = hard_locale (LC_TIME);
2301 #endif
2302
2303   /* Get locale's representation of the decimal point.  */
2304   {
2305     struct lconv const *locale = localeconv ();
2306
2307     /* If the locale doesn't define a decimal point, or if the decimal
2308        point is multibyte, use the C locale's decimal point.  FIXME:
2309        add support for multibyte decimal points.  */
2310     decimal_point = locale->decimal_point[0];
2311     if (! decimal_point || locale->decimal_point[1])
2312       decimal_point = '.';
2313
2314     /* FIXME: add support for multibyte thousands separators.  */
2315     thousands_sep = *locale->thousands_sep;
2316     if (! thousands_sep || locale->thousands_sep[1])
2317       thousands_sep = CHAR_MAX + 1;
2318   }
2319
2320   have_read_stdin = false;
2321   inittables ();
2322
2323   {
2324     size_t i;
2325     static int const sig[] = { SIGHUP, SIGINT, SIGPIPE, SIGTERM };
2326     enum { nsigs = sizeof sig / sizeof sig[0] };
2327
2328 #ifdef SA_NOCLDSTOP
2329     struct sigaction act;
2330
2331     sigemptyset (&caught_signals);
2332     for (i = 0; i < nsigs; i++)
2333       {
2334         sigaction (sig[i], NULL, &act);
2335         if (act.sa_handler != SIG_IGN)
2336           sigaddset (&caught_signals, sig[i]);
2337       }
2338
2339     act.sa_handler = sighandler;
2340     act.sa_mask = caught_signals;
2341     act.sa_flags = 0;
2342
2343     for (i = 0; i < nsigs; i++)
2344       if (sigismember (&caught_signals, sig[i]))
2345         sigaction (sig[i], &act, NULL);
2346 #else
2347     for (i = 0; i < nsigs; i++)
2348       if (signal (sig[i], SIG_IGN) != SIG_IGN)
2349         signal (sig[i], sighandler);
2350 #endif
2351   }
2352
2353   gkey.sword = gkey.eword = SIZE_MAX;
2354   gkey.ignore = NULL;
2355   gkey.translate = NULL;
2356   gkey.numeric = gkey.general_numeric = gkey.month = gkey.reverse = false;
2357   gkey.skipsblanks = gkey.skipeblanks = false;
2358
2359   files = xnmalloc (argc, sizeof *files);
2360
2361   for (;;)
2362     {
2363       /* Parse an operand as a file after "--" was seen; or if
2364          pedantic and a file was seen, unless the POSIX version
2365          predates 1003.1-2001 and -c was not seen and the operand is
2366          "-o FILE" or "-oFILE".  */
2367
2368       if (c == -1
2369           || (posixly_correct && nfiles != 0
2370               && ! (obsolete_usage
2371                     && ! checkonly
2372                     && optind != argc
2373                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
2374                     && (argv[optind][2] || optind + 1 != argc)))
2375           || ((c = getopt_long (argc, argv, short_options,
2376                                 long_options, NULL))
2377               == -1))
2378         {
2379           if (argc <= optind)
2380             break;
2381           files[nfiles++] = argv[optind++];
2382         }
2383       else switch (c)
2384         {
2385         case 1:
2386           key = NULL;
2387           if (obsolete_usage && optarg[0] == '+')
2388             {
2389               /* Treat +POS1 [-POS2] as a key if possible; but silently
2390                  treat an operand as a file if it is not a valid +POS1.  */
2391               key = new_key ();
2392               s = parse_field_count (optarg + 1, &key->sword, NULL);
2393               if (s && *s == '.')
2394                 s = parse_field_count (s + 1, &key->schar, NULL);
2395               if (! (key->sword | key->schar))
2396                 key->sword = SIZE_MAX;
2397               if (! s || *set_ordering (s, key, bl_start))
2398                 {
2399                   free (key);
2400                   key = NULL;
2401                 }
2402               else
2403                 {
2404                   if (optind != argc && argv[optind][0] == '-'
2405                       && ISDIGIT (argv[optind][1]))
2406                     {
2407                       char const *optarg1 = argv[optind++];
2408                       s = parse_field_count (optarg1 + 1, &key->eword,
2409                                              N_("invalid number after `-'"));
2410                       if (*s == '.')
2411                         s = parse_field_count (s + 1, &key->echar,
2412                                                N_("invalid number after `.'"));
2413                       if (*set_ordering (s, key, bl_end))
2414                         badfieldspec (optarg1,
2415                                       N_("stray character in field spec"));
2416                     }
2417                   insertkey (key);
2418                 }
2419             }
2420           if (! key)
2421             files[nfiles++] = optarg;
2422           break;
2423
2424         case 'b':
2425         case 'd':
2426         case 'f':
2427         case 'g':
2428         case 'i':
2429         case 'M':
2430         case 'n':
2431         case 'r':
2432           {
2433             char str[2];
2434             str[0] = c;
2435             str[1] = '\0';
2436             set_ordering (str, &gkey, bl_both);
2437           }
2438           break;
2439
2440         case 'c':
2441           checkonly = true;
2442           break;
2443
2444         case 'k':
2445           key = new_key ();
2446
2447           /* Get POS1. */
2448           s = parse_field_count (optarg, &key->sword,
2449                                  N_("invalid number at field start"));
2450           if (! key->sword--)
2451             {
2452               /* Provoke with `sort -k0' */
2453               badfieldspec (optarg, N_("field number is zero"));
2454             }
2455           if (*s == '.')
2456             {
2457               s = parse_field_count (s + 1, &key->schar,
2458                                      N_("invalid number after `.'"));
2459               if (! key->schar--)
2460                 {
2461                   /* Provoke with `sort -k1.0' */
2462                   badfieldspec (optarg, N_("character offset is zero"));
2463                 }
2464             }
2465           if (! (key->sword | key->schar))
2466             key->sword = SIZE_MAX;
2467           s = set_ordering (s, key, bl_start);
2468           if (*s != ',')
2469             {
2470               key->eword = SIZE_MAX;
2471               key->echar = 0;
2472             }
2473           else
2474             {
2475               /* Get POS2. */
2476               s = parse_field_count (s + 1, &key->eword,
2477                                      N_("invalid number after `,'"));
2478               if (! key->eword--)
2479                 {
2480                   /* Provoke with `sort -k1,0' */
2481                   badfieldspec (optarg, N_("field number is zero"));
2482                 }
2483               if (*s == '.')
2484                 s = parse_field_count (s + 1, &key->echar,
2485                                        N_("invalid number after `.'"));
2486               else
2487                 {
2488                   /* `-k 2,3' is equivalent to `+1 -3'.  */
2489                   key->eword++;
2490                 }
2491               s = set_ordering (s, key, bl_end);
2492             }
2493           if (*s)
2494             badfieldspec (optarg, N_("stray character in field spec"));
2495           insertkey (key);
2496           break;
2497
2498         case 'm':
2499           mergeonly = true;
2500           break;
2501
2502         case 'o':
2503           if (outfile && !STREQ (outfile, optarg))
2504             error (SORT_FAILURE, 0, _("multiple output files specified"));
2505           outfile = optarg;
2506           break;
2507
2508         case 's':
2509           stable = true;
2510           break;
2511
2512         case 'S':
2513           specify_sort_size (optarg);
2514           break;
2515
2516         case 't':
2517           {
2518             char newtab = optarg[0];
2519             if (! newtab)
2520               error (SORT_FAILURE, 0, _("empty tab"));
2521             if (optarg[1])
2522               {
2523                 if (STREQ (optarg, "\\0"))
2524                   newtab = '\0';
2525                 else
2526                   {
2527                     /* Provoke with `sort -txx'.  Complain about
2528                        "multi-character tab" instead of "multibyte tab", so
2529                        that the diagnostic's wording does not need to be
2530                        changed once multibyte characters are supported.  */
2531                     error (SORT_FAILURE, 0, _("multi-character tab `%s'"),
2532                            optarg);
2533                   }
2534               }
2535             if (tab != TAB_DEFAULT && tab != newtab)
2536               error (SORT_FAILURE, 0, _("incompatible tabs"));
2537             tab = newtab;
2538           }
2539           break;
2540
2541         case 'T':
2542           add_temp_dir (optarg);
2543           break;
2544
2545         case 'u':
2546           unique = true;
2547           break;
2548
2549         case 'y':
2550           /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
2551              through Solaris 7.  It is also accepted by many non-Solaris
2552              "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
2553              -y is marked as obsolete starting with Solaris 8 (1999), but is
2554              still accepted as of Solaris 10 prerelease (2004).
2555
2556              Solaris 2.5.1 "sort -y 100" reads the input file "100", but
2557              emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
2558              and which in general ignores the argument after "-y" if it
2559              consists entirely of digits (it can even be empty).  */
2560           if (!optarg && optind != argc)
2561             {
2562               char const *p;
2563               for (p = argv[optind]; ISDIGIT (*p); p++)
2564                 continue;
2565               optind += !*p;
2566             }
2567           break;
2568
2569         case 'z':
2570           eolchar = 0;
2571           break;
2572
2573         case_GETOPT_HELP_CHAR;
2574
2575         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2576
2577         default:
2578           usage (SORT_FAILURE);
2579         }
2580     }
2581
2582   /* Inheritance of global options to individual keys. */
2583   for (key = keylist; key; key = key->next)
2584     if (! (key->ignore || key->translate
2585            || (key->skipsblanks | key->reverse
2586                | key->skipeblanks | key->month | key->numeric
2587                | key->general_numeric)))
2588       {
2589         key->ignore = gkey.ignore;
2590         key->translate = gkey.translate;
2591         key->skipsblanks = gkey.skipsblanks;
2592         key->skipeblanks = gkey.skipeblanks;
2593         key->month = gkey.month;
2594         key->numeric = gkey.numeric;
2595         key->general_numeric = gkey.general_numeric;
2596         key->reverse = gkey.reverse;
2597       }
2598
2599   if (!keylist && (gkey.ignore || gkey.translate
2600                    || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
2601                        | gkey.numeric | gkey.general_numeric)))
2602     insertkey (&gkey);
2603   reverse = gkey.reverse;
2604
2605   if (temp_dir_count == 0)
2606     {
2607       char const *tmp_dir = getenv ("TMPDIR");
2608       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
2609     }
2610
2611   if (nfiles == 0)
2612     {
2613       nfiles = 1;
2614       files = &minus;
2615     }
2616
2617   if (checkonly)
2618     {
2619       if (nfiles > 1)
2620         {
2621           error (0, 0, _("extra operand %s not allowed with -c"),
2622                  quote (files[1]));
2623           usage (SORT_FAILURE);
2624         }
2625
2626       /* POSIX requires that sort return 1 IFF invoked with -c and the
2627          input is not properly sorted.  */
2628       exit (check (files[0]) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
2629     }
2630
2631   if (mergeonly)
2632     merge (files, 0, nfiles, outfile);
2633   else
2634     sort (files, nfiles, outfile);
2635
2636   if (have_read_stdin && fclose (stdin) == EOF)
2637     die (_("close failed"), "-");
2638
2639   exit (EXIT_SUCCESS);
2640 }