src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 88, 1991-2004 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17
  18    Written December 1988 by Mike Haertel.
  19    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  20    or (US mail) as Mike Haertel c/o Free Software Foundation.
  21
  22    Ørn E. Hansen added NLS support in 1997.  */
  23
  24 #include <config.h>
  25
  26 #include <getopt.h>
  27 #include <sys/types.h>
  28 #include <signal.h>
  29 #include <stdio.h>
  30 #include "system.h"
  31 #include "error.h"
  32 #include "hard-locale.h"
  33 #include "inttostr.h"
  34 #include "long-options.h"
  35 #include "physmem.h"
  36 #include "posixver.h"
  37 #include "stdio-safer.h"
  38 #include "xmemcoll.h"
  39 #include "xstrtol.h"
  40
  41 #if HAVE_SYS_RESOURCE_H
  42 # include <sys/resource.h>
  43 #endif
  44 #ifndef RLIMIT_DATA
  45 struct rlimit { size_t rlim_cur; };
  46 # define getrlimit(Resource, Rlp) (-1)
  47 #endif
  48
  49 /* The official name of this program (e.g., no `g' prefix).  */
  50 #define PROGRAM_NAME "sort"
  51
  52 #define AUTHORS "Mike Haertel", "Paul Eggert"
  53
  54 #if HAVE_LANGINFO_CODESET
  55 # include <langinfo.h>
  56 #endif
  57
  58 #ifndef SA_NOCLDSTOP
  59 # define sigprocmask(How, Set, Oset) /* empty */
  60 # define sigset_t int
  61 #endif
  62
  63 #ifndef STDC_HEADERS
  64 double strtod ();
  65 #endif
  66
  67 #define UCHAR_LIM (UCHAR_MAX + 1)
  68 #define UCHAR(c) ((unsigned char) (c))
  69
  70 #ifndef DEFAULT_TMPDIR
  71 # define DEFAULT_TMPDIR "/tmp"
  72 #endif
  73
  74 /* Exit statuses.  */
  75 enum
  76   {
  77     /* POSIX says to exit with status 1 if invoked with -c and the
  78        input is not properly sorted.  */
  79     SORT_OUT_OF_ORDER = 1,
  80
  81     /* POSIX says any other irregular exit must exit with a status
  82        code greater than 1.  */
  83     SORT_FAILURE = 2
  84   };
  85
  86 #define C_DECIMAL_POINT '.'
  87 #define NEGATION_SIGN   '-'
  88 #define NUMERIC_ZERO    '0'
  89
  90 #if HAVE_SETLOCALE
  91
  92 static char decimal_point;
  93 static int th_sep; /* if CHAR_MAX + 1, then there is no thousands separator */
  94
  95 /* Nonzero if the corresponding locales are hard.  */
  96 static bool hard_LC_COLLATE;
  97 # if HAVE_NL_LANGINFO
  98 static bool hard_LC_TIME;
  99 # endif
 100
 101 # define IS_THOUSANDS_SEP(x) ((x) == th_sep)
 102
 103 #else
 104
 105 # define decimal_point C_DECIMAL_POINT
 106 # define IS_THOUSANDS_SEP(x) 0
 107
 108 #endif
 109
 110 #define NONZERO(x) (x != 0)
 111
 112 /* The kind of blanks for '-b' to skip in various options. */
 113 enum blanktype { bl_start, bl_end, bl_both };
 114
 115 /* The character marking end of line. Default to \n. */
 116 static char eolchar = '\n';
 117
 118 /* Lines are held in core as counted strings. */
 119 struct line
 120 {
 121   char *text;                   /* Text of the line. */
 122   size_t length;                /* Length including final newline. */
 123   char *keybeg;                 /* Start of first key. */
 124   char *keylim;                 /* Limit of first key. */
 125 };
 126
 127 /* Input buffers. */
 128 struct buffer
 129 {
 130   char *buf;                    /* Dynamically allocated buffer,
 131                                    partitioned into 3 regions:
 132                                    - input data;
 133                                    - unused area;
 134                                    - an array of lines, in reverse order.  */
 135   size_t used;                  /* Number of bytes used for input data.  */
 136   size_t nlines;                /* Number of lines in the line array.  */
 137   size_t alloc;                 /* Number of bytes allocated. */
 138   size_t left;                  /* Number of bytes left from previous reads. */
 139   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 140   bool eof;                     /* An EOF has been read.  */
 141 };
 142
 143 struct keyfield
 144 {
 145   size_t sword;                 /* Zero-origin 'word' to start at. */
 146   size_t schar;                 /* Additional characters to skip. */
 147   size_t eword;                 /* Zero-origin first word after field. */
 148   size_t echar;                 /* Additional characters in field. */
 149   bool const *ignore;           /* Boolean array of characters to ignore. */
 150   char const *translate;        /* Translation applied to characters. */
 151   bool skipsblanks;             /* Skip leading blanks at start. */
 152   bool skipeblanks;             /* Skip trailing blanks at finish. */
 153   bool numeric;                 /* Flag for numeric comparison.  Handle
 154                                    strings of digits with optional decimal
 155                                    point, but no exponential notation. */
 156   bool general_numeric;         /* Flag for general, numeric comparison.
 157                                    Handle numbers in exponential notation. */
 158   bool month;                   /* Flag for comparison by month name. */
 159   bool reverse;                 /* Reverse the sense of comparison. */
 160   struct keyfield *next;        /* Next keyfield to try. */
 161 };
 162
 163 struct month
 164 {
 165   char const *name;
 166   int val;
 167 };
 168
 169 /* The name this program was run with. */
 170 char *program_name;
 171
 172 /* FIXME: None of these tables work with multibyte character sets.
 173    Also, there are many other bugs when handling multibyte characters.
 174    One way to fix this is to rewrite `sort' to use wide characters
 175    internally, but doing this with good performance is a bit
 176    tricky.  */
 177
 178 /* Table of blanks.  */
 179 static bool blanks[UCHAR_LIM];
 180
 181 /* Table of non-printing characters. */
 182 static bool nonprinting[UCHAR_LIM];
 183
 184 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 185 static bool nondictionary[UCHAR_LIM];
 186
 187 /* Translation table folding lower case to upper.  */
 188 static char fold_toupper[UCHAR_LIM];
 189
 190 #define MONTHS_PER_YEAR 12
 191
 192 /* Table mapping month names to integers.
 193    Alphabetic order allows binary search. */
 194 static struct month monthtab[] =
 195 {
 196   {"APR", 4},
 197   {"AUG", 8},
 198   {"DEC", 12},
 199   {"FEB", 2},
 200   {"JAN", 1},
 201   {"JUL", 7},
 202   {"JUN", 6},
 203   {"MAR", 3},
 204   {"MAY", 5},
 205   {"NOV", 11},
 206   {"OCT", 10},
 207   {"SEP", 9}
 208 };
 209
 210 /* During the merge phase, the number of files to merge at once. */
 211 #define NMERGE 16
 212
 213 /* Minimum size for a merge or check buffer.  */
 214 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 215
 216 /* Minimum sort size; the code might not work with smaller sizes.  */
 217 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
 218
 219 /* The number of bytes needed for a merge or check buffer, which can
 220    function relatively efficiently even if it holds only one line.  If
 221    a longer line is seen, this value is increased.  */
 222 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 223
 224 /* The approximate maximum number of bytes of main memory to use, as
 225    specified by the user.  Zero if the user has not specified a size.  */
 226 static size_t sort_size;
 227
 228 /* The guessed size for non-regular files.  */
 229 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 230
 231 /* Array of directory names in which any temporary files are to be created. */
 232 static char const **temp_dirs;
 233
 234 /* Number of temporary directory names used.  */
 235 static size_t temp_dir_count;
 236
 237 /* Number of allocated slots in temp_dirs.  */
 238 static size_t temp_dir_alloc;
 239
 240 /* Flag to reverse the order of all comparisons. */
 241 static bool reverse;
 242
 243 /* Flag for stable sort.  This turns off the last ditch bytewise
 244    comparison of lines, and instead leaves lines in the same order
 245    they were read if all keys compare equal.  */
 246 static bool stable;
 247
 248 /* If TAB has this value, blanks separate fields.  */
 249 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 250
 251 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 252    separated by the empty string between a non-blank character and a blank
 253    character. */
 254 static int tab = TAB_DEFAULT;
 255
 256 /* Flag to remove consecutive duplicate lines from the output.
 257    Only the last of a sequence of equal lines will be output. */
 258 static bool unique;
 259
 260 /* Nonzero if any of the input files are the standard input. */
 261 static bool have_read_stdin;
 262
 263 /* List of key field comparisons to be tried.  */
 264 static struct keyfield *keylist;
 265
 266 static void sortlines_temp (struct line *, size_t, struct line *);
 267
 268 void
 269 usage (int status)
 270 {
 271   if (status != EXIT_SUCCESS)
 272     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 273              program_name);
 274   else
 275     {
 276       printf (_("\
 277 Usage: %s [OPTION]... [FILE]...\n\
 278 "),
 279               program_name);
 280       fputs (_("\
 281 Write sorted concatenation of all FILE(s) to standard output.\n\
 282 \n\
 283 Ordering options:\n\
 284 \n\
 285 "), stdout);
 286       fputs (_("\
 287 Mandatory arguments to long options are mandatory for short options too.\n\
 288 "), stdout);
 289       fputs (_("\
 290   -b, --ignore-leading-blanks ignore leading blanks\n\
 291   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 292   -f, --ignore-case           fold lower case to upper case characters\n\
 293 "), stdout);
 294       fputs (_("\
 295   -g, --general-numeric-sort  compare according to general numerical value\n\
 296   -i, --ignore-nonprinting    consider only printable characters\n\
 297   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 298   -n, --numeric-sort          compare according to string numerical value\n\
 299   -r, --reverse               reverse the result of comparisons\n\
 300 \n\
 301 "), stdout);
 302       fputs (_("\
 303 Other options:\n\
 304 \n\
 305   -c, --check               check whether input is sorted; do not sort\n\
 306   -k, --key=POS1[,POS2]     start a key at POS1, end it at POS 2 (origin 1)\n\
 307   -m, --merge               merge already sorted files; do not sort\n\
 308   -o, --output=FILE         write result to FILE instead of standard output\n\
 309   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 310   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 311 "), stdout);
 312       printf (_("\
 313   -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
 314   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 315                               multiple options specify multiple directories\n\
 316   -u, --unique              with -c, check for strict ordering;\n\
 317                               without -c, output only the first of an equal run\n\
 318 "), DEFAULT_TMPDIR);
 319       fputs (_("\
 320   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 321 "), stdout);
 322       fputs (HELP_OPTION_DESCRIPTION, stdout);
 323       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 324       fputs (_("\
 325 \n\
 326 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 327 in the field.  OPTS is one or more single-letter ordering options, which\n\
 328 override global ordering options for that key.  If no key is given, use the\n\
 329 entire line as the key.\n\
 330 \n\
 331 SIZE may be followed by the following multiplicative suffixes:\n\
 332 "), stdout);
 333       fputs (_("\
 334 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 335 \n\
 336 With no FILE, or when FILE is -, read standard input.\n\
 337 \n\
 338 *** WARNING ***\n\
 339 The locale specified by the environment affects sort order.\n\
 340 Set LC_ALL=C to get the traditional sort order that uses\n\
 341 native byte values.\n\
 342 "), stdout );
 343       printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 344     }
 345
 346   exit (status);
 347 }
 348
 349 #define COMMON_SHORT_OPTIONS "-bcdfgik:mMno:rsS:t:T:uz"
 350
 351 static struct option const long_options[] =
 352 {
 353   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 354   {"check", no_argument, NULL, 'c'},
 355   {"dictionary-order", no_argument, NULL, 'd'},
 356   {"ignore-case", no_argument, NULL, 'f'},
 357   {"general-numeric-sort", no_argument, NULL, 'g'},
 358   {"ignore-nonprinting", no_argument, NULL, 'i'},
 359   {"key", required_argument, NULL, 'k'},
 360   {"merge", no_argument, NULL, 'm'},
 361   {"month-sort", no_argument, NULL, 'M'},
 362   {"numeric-sort", no_argument, NULL, 'n'},
 363   {"output", required_argument, NULL, 'o'},
 364   {"reverse", no_argument, NULL, 'r'},
 365   {"stable", no_argument, NULL, 's'},
 366   {"buffer-size", required_argument, NULL, 'S'},
 367   {"field-separator", required_argument, NULL, 't'},
 368   {"temporary-directory", required_argument, NULL, 'T'},
 369   {"unique", no_argument, NULL, 'u'},
 370   {"zero-terminated", no_argument, NULL, 'z'},
 371   {GETOPT_HELP_OPTION_DECL},
 372   {GETOPT_VERSION_OPTION_DECL},
 373   {0, 0, 0, 0},
 374 };
 375
 376 /* The set of signals that are caught.  */
 377 static sigset_t caught_signals;
 378
 379 /* The list of temporary files. */
 380 struct tempnode
 381 {
 382   struct tempnode *volatile next;
 383   char name[1];  /* Actual size is 1 + file name length.  */
 384 };
 385 static struct tempnode *volatile temphead;
 386
 387 /* Clean up any remaining temporary files. */
 388
 389 static void
 390 cleanup (void)
 391 {
 392   struct tempnode const *node;
 393
 394   for (node = temphead; node; node = node->next)
 395     unlink (node->name);
 396 }
 397
 398 /* Report MESSAGE for FILE, then clean up and exit.  */
 399
 400 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 401 static void
 402 die (char const *message, char const *file)
 403 {
 404   error (0, errno, "%s: %s", message, file);
 405   exit (SORT_FAILURE);
 406 }
 407
 408 /* Create a new temporary file, returning its newly allocated name.
 409    Store into *PFP a stream open for writing.  */
 410
 411 static char *
 412 create_temp_file (FILE **pfp)
 413 {
 414   static char const slashbase[] = "/sortXXXXXX";
 415   static size_t temp_dir_index;
 416   sigset_t oldset;
 417   int fd;
 418   int saved_errno;
 419   char const *temp_dir = temp_dirs[temp_dir_index];
 420   size_t len = strlen (temp_dir);
 421   struct tempnode *node =
 422     xmalloc (sizeof node->next + len + sizeof slashbase);
 423   char *file = node->name;
 424
 425   memcpy (file, temp_dir, len);
 426   memcpy (file + len, slashbase, sizeof slashbase);
 427   node->next = temphead;
 428   if (++temp_dir_index == temp_dir_count)
 429     temp_dir_index = 0;
 430
 431   /* Create the temporary file in a critical section, to avoid races.  */
 432   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 433   fd = mkstemp (file);
 434   if (0 <= fd)
 435     temphead = node;
 436   saved_errno = errno;
 437   sigprocmask (SIG_SETMASK, &oldset, NULL);
 438   errno = saved_errno;
 439
 440   if (fd < 0 || (*pfp = fdopen (fd, "w")) == NULL)
 441     die (_("cannot create temporary file"), file);
 442
 443   return file;
 444 }
 445
 446 static FILE *
 447 xfopen (const char *file, const char *how)
 448 {
 449   FILE *fp;
 450
 451   if (STREQ (file, "-"))
 452     {
 453       if (*how == 'r')
 454         {
 455           have_read_stdin = true;
 456           fp = stdin;
 457         }
 458       else
 459         fp = stdout;
 460     }
 461   else
 462     {
 463       if ((fp = fopen_safer (file, how)) == NULL)
 464         die (_("open failed"), file);
 465     }
 466
 467   return fp;
 468 }
 469
 470 /* Close FP, whose name is FILE, and report any errors.  */
 471
 472 static void
 473 xfclose (FILE *fp, char const *file)
 474 {
 475   if (fp == stdin)
 476     {
 477       /* Allow reading stdin from tty more than once. */
 478       if (feof (fp))
 479         clearerr (fp);
 480     }
 481   else
 482     {
 483       if (fclose (fp) != 0)
 484         die (_("close failed"), file);
 485     }
 486 }
 487
 488 static void
 489 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
 490 {
 491   if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
 492     die (_("write failed"), output_file);
 493 }
 494
 495 /* Append DIR to the array of temporary directory names.  */
 496 static void
 497 add_temp_dir (char const *dir)
 498 {
 499   if (temp_dir_count == temp_dir_alloc)
 500     temp_dirs = x2nrealloc (temp_dirs, &temp_dir_alloc, sizeof *temp_dirs);
 501
 502   temp_dirs[temp_dir_count++] = dir;
 503 }
 504
 505 /* Search through the list of temporary files for NAME;
 506    remove it if it is found on the list. */
 507
 508 static void
 509 zaptemp (const char *name)
 510 {
 511   struct tempnode *volatile *pnode;
 512   struct tempnode *node;
 513
 514   for (pnode = &temphead; (node = *pnode); pnode = &node->next)
 515     if (node->name == name)
 516       {
 517         unlink (name);
 518         *pnode = node->next;
 519         free (node);
 520         break;
 521       }
 522 }
 523
 524 #if HAVE_NL_LANGINFO
 525
 526 static int
 527 struct_month_cmp (const void *m1, const void *m2)
 528 {
 529   struct month const *month1 = m1;
 530   struct month const *month2 = m2;
 531   return strcmp (month1->name, month2->name);
 532 }
 533
 534 #endif
 535
 536 /* Initialize the character class tables. */
 537
 538 static void
 539 inittables (void)
 540 {
 541   int i;
 542
 543   for (i = 0; i < UCHAR_LIM; ++i)
 544     {
 545       blanks[i] = !!ISBLANK (i);
 546       nonprinting[i] = !ISPRINT (i);
 547       nondictionary[i] = !ISALNUM (i) && !ISBLANK (i);
 548       fold_toupper[i] = (ISLOWER (i) ? toupper (i) : i);
 549     }
 550
 551 #if HAVE_NL_LANGINFO
 552   /* If we're not in the "C" locale, read different names for months.  */
 553   if (hard_LC_TIME)
 554     {
 555       for (i = 0; i < MONTHS_PER_YEAR; i++)
 556         {
 557           char const *s;
 558           size_t s_len;
 559           size_t j;
 560           char *name;
 561
 562           s = (char *) nl_langinfo (ABMON_1 + i);
 563           s_len = strlen (s);
 564           monthtab[i].name = name = xmalloc (s_len + 1);
 565           monthtab[i].val = i + 1;
 566
 567           for (j = 0; j < s_len; j++)
 568             name[j] = fold_toupper[UCHAR (s[j])];
 569           name[j] = '\0';
 570         }
 571       qsort ((void *) monthtab, MONTHS_PER_YEAR,
 572              sizeof *monthtab, struct_month_cmp);
 573     }
 574 #endif
 575 }
 576
 577 /* Specify the amount of main memory to use when sorting.  */
 578 static void
 579 specify_sort_size (char const *s)
 580 {
 581   uintmax_t n;
 582   char *suffix;
 583   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
 584
 585   /* The default unit is KiB.  */
 586   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
 587     {
 588       if (n <= UINTMAX_MAX / 1024)
 589         n *= 1024;
 590       else
 591         e = LONGINT_OVERFLOW;
 592     }
 593
 594   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
 595   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
 596     switch (suffix[0])
 597       {
 598       case 'b':
 599         e = LONGINT_OK;
 600         break;
 601
 602       case '%':
 603         {
 604           double mem = physmem_total () * n / 100;
 605
 606           /* Use "<", not "<=", to avoid problems with rounding.  */
 607           if (mem < UINTMAX_MAX)
 608             {
 609               n = mem;
 610               e = LONGINT_OK;
 611             }
 612           else
 613             e = LONGINT_OVERFLOW;
 614         }
 615         break;
 616       }
 617
 618   if (e == LONGINT_OK)
 619     {
 620       /* If multiple sort sizes are specified, take the maximum, so
 621          that option order does not matter.  */
 622       if (n < sort_size)
 623         return;
 624
 625       sort_size = n;
 626       if (sort_size == n)
 627         {
 628           sort_size = MAX (sort_size, MIN_SORT_SIZE);
 629           return;
 630         }
 631
 632       e = LONGINT_OVERFLOW;
 633     }
 634
 635   STRTOL_FATAL_ERROR (s, _("sort size"), e);
 636 }
 637
 638 /* Return the default sort size.  */
 639 static size_t
 640 default_sort_size (void)
 641 {
 642   /* Let MEM be available memory or 1/8 of total memory, whichever
 643      is greater.  */
 644   double avail = physmem_available ();
 645   double total = physmem_total ();
 646   double mem = MAX (avail, total / 8);
 647   struct rlimit rlimit;
 648
 649   /* Let SIZE be MEM, but no more than the maximum object size or
 650      system resource limits.  Avoid the MIN macro here, as it is not
 651      quite right when only one argument is floating point.  Don't
 652      bother to check for values like RLIM_INFINITY since in practice
 653      they are not much less than SIZE_MAX.  */
 654   size_t size = SIZE_MAX;
 655   if (mem < size)
 656     size = mem;
 657   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
 658     size = rlimit.rlim_cur;
 659 #ifdef RLIMIT_AS
 660   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
 661     size = rlimit.rlim_cur;
 662 #endif
 663
 664   /* Leave a large safety margin for the above limits, as failure can
 665      occur when they are exceeded.  */
 666   size /= 2;
 667
 668 #ifdef RLIMIT_RSS
 669   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
 670      Exceeding RSS is not fatal, but can be quite slow.  */
 671   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
 672     size = rlimit.rlim_cur / 16 * 15;
 673 #endif
 674
 675   /* Use no less than the minimum.  */
 676   return MAX (size, MIN_SORT_SIZE);
 677 }
 678
 679 /* Return the sort buffer size to use with the input files identified
 680    by FPS and FILES, which are alternate paths to the same files.
 681    NFILES gives the number of input files; NFPS may be less.  Assume
 682    that each input line requires LINE_BYTES extra bytes' worth of line
 683    information.  Do not exceed a bound on the size: if the bound is
 684    not specified by the user, use a default.  */
 685
 686 static size_t
 687 sort_buffer_size (FILE *const *fps, int nfps,
 688                   char *const *files, int nfiles,
 689                   size_t line_bytes)
 690 {
 691   /* A bound on the input size.  If zero, the bound hasn't been
 692      determined yet.  */
 693   static size_t size_bound;
 694
 695   /* In the worst case, each input byte is a newline.  */
 696   size_t worst_case_per_input_byte = line_bytes + 1;
 697
 698   /* Keep enough room for one extra input line and an extra byte.
 699      This extra room might be needed when preparing to read EOF.  */
 700   size_t size = worst_case_per_input_byte + 1;
 701
 702   int i;
 703
 704   for (i = 0; i < nfiles; i++)
 705     {
 706       struct stat st;
 707       off_t file_size;
 708       size_t worst_case;
 709
 710       if ((i < nfps ? fstat (fileno (fps[i]), &st)
 711            : strcmp (files[i], "-") == 0 ? fstat (STDIN_FILENO, &st)
 712            : stat (files[i], &st))
 713           != 0)
 714         die (_("stat failed"), files[i]);
 715
 716       if (S_ISREG (st.st_mode))
 717         file_size = st.st_size;
 718       else
 719         {
 720           /* The file has unknown size.  If the user specified a sort
 721              buffer size, use that; otherwise, guess the size.  */
 722           if (sort_size)
 723             return sort_size;
 724           file_size = INPUT_FILE_SIZE_GUESS;
 725         }
 726
 727       if (! size_bound)
 728         {
 729           size_bound = sort_size;
 730           if (! size_bound)
 731             size_bound = default_sort_size ();
 732         }
 733
 734       /* Add the amount of memory needed to represent the worst case
 735          where the input consists entirely of newlines followed by a
 736          single non-newline.  Check for overflow.  */
 737       worst_case = file_size * worst_case_per_input_byte + 1;
 738       if (file_size != worst_case / worst_case_per_input_byte
 739           || size_bound - size <= worst_case)
 740         return size_bound;
 741       size += worst_case;
 742     }
 743
 744   return size;
 745 }
 746
 747 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
 748    must be at least sizeof (struct line).  Allocate ALLOC bytes
 749    initially.  */
 750
 751 static void
 752 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
 753 {
 754   /* Ensure that the line array is properly aligned.  If the desired
 755      size cannot be allocated, repeatedly halve it until allocation
 756      succeeds.  The smaller allocation may hurt overall performance,
 757      but that's better than failing.  */
 758   for (;;)
 759     {
 760       alloc += sizeof (struct line) - alloc % sizeof (struct line);
 761       buf->buf = malloc (alloc);
 762       if (buf->buf)
 763         break;
 764       alloc /= 2;
 765       if (alloc <= line_bytes + 1)
 766         xalloc_die ();
 767     }
 768
 769   buf->line_bytes = line_bytes;
 770   buf->alloc = alloc;
 771   buf->used = buf->left = buf->nlines = 0;
 772   buf->eof = false;
 773 }
 774
 775 /* Return one past the limit of the line array.  */
 776
 777 static inline struct line *
 778 buffer_linelim (struct buffer const *buf)
 779 {
 780   return (struct line *) (buf->buf + buf->alloc);
 781 }
 782
 783 /* Return a pointer to the first character of the field specified
 784    by KEY in LINE. */
 785
 786 static char *
 787 begfield (const struct line *line, const struct keyfield *key)
 788 {
 789   register char *ptr = line->text, *lim = ptr + line->length - 1;
 790   register size_t sword = key->sword;
 791   register size_t schar = key->schar;
 792   register size_t remaining_bytes;
 793
 794   /* The leading field separator itself is included in a field when -t
 795      is absent.  */
 796
 797   if (tab != TAB_DEFAULT)
 798     while (ptr < lim && sword--)
 799       {
 800         while (ptr < lim && *ptr != tab)
 801           ++ptr;
 802         if (ptr < lim)
 803           ++ptr;
 804       }
 805   else
 806     while (ptr < lim && sword--)
 807       {
 808         while (ptr < lim && blanks[UCHAR (*ptr)])
 809           ++ptr;
 810         while (ptr < lim && !blanks[UCHAR (*ptr)])
 811           ++ptr;
 812       }
 813
 814   if (key->skipsblanks)
 815     while (ptr < lim && blanks[UCHAR (*ptr)])
 816       ++ptr;
 817
 818   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
 819   remaining_bytes = lim - ptr;
 820   if (schar < remaining_bytes)
 821     ptr += schar;
 822   else
 823     ptr = lim;
 824
 825   return ptr;
 826 }
 827
 828 /* Return the limit of (a pointer to the first character after) the field
 829    in LINE specified by KEY. */
 830
 831 static char *
 832 limfield (const struct line *line, const struct keyfield *key)
 833 {
 834   register char *ptr = line->text, *lim = ptr + line->length - 1;
 835   register size_t eword = key->eword, echar = key->echar;
 836   register size_t remaining_bytes;
 837
 838   /* Move PTR past EWORD fields or to one past the last byte on LINE,
 839      whichever comes first.  If there are more than EWORD fields, leave
 840      PTR pointing at the beginning of the field having zero-based index,
 841      EWORD.  If a delimiter character was specified (via -t), then that
 842      `beginning' is the first character following the delimiting TAB.
 843      Otherwise, leave PTR pointing at the first `blank' character after
 844      the preceding field.  */
 845   if (tab != TAB_DEFAULT)
 846     while (ptr < lim && eword--)
 847       {
 848         while (ptr < lim && *ptr != tab)
 849           ++ptr;
 850         if (ptr < lim && (eword | echar))
 851           ++ptr;
 852       }
 853   else
 854     while (ptr < lim && eword--)
 855       {
 856         while (ptr < lim && blanks[UCHAR (*ptr)])
 857           ++ptr;
 858         while (ptr < lim && !blanks[UCHAR (*ptr)])
 859           ++ptr;
 860       }
 861
 862 #ifdef POSIX_UNSPECIFIED
 863   /* The following block of code makes GNU sort incompatible with
 864      standard Unix sort, so it's ifdef'd out for now.
 865      The POSIX spec isn't clear on how to interpret this.
 866      FIXME: request clarification.
 867
 868      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
 869      Date: Thu, 30 May 96 12:20:41 -0400
 870      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
 871
 872      [...]I believe I've found another bug in `sort'.
 873
 874      $ cat /tmp/sort.in
 875      a b c 2 d
 876      pq rs 1 t
 877      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
 878      a b c 2 d
 879      pq rs 1 t
 880      $ /bin/sort -k1.7,1.7 </tmp/sort.in
 881      pq rs 1 t
 882      a b c 2 d
 883
 884      Unix sort produced the answer I expected: sort on the single character
 885      in column 7.  GNU sort produced different results, because it disagrees
 886      on the interpretation of the key-end spec "M.N".  Unix sort reads this
 887      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
 888      "skip M-1 fields, then either N-1 characters or the rest of the current
 889      field, whichever comes first".  This extra clause applies only to
 890      key-ends, not key-starts.
 891      */
 892
 893   /* Make LIM point to the end of (one byte past) the current field.  */
 894   if (tab != TAB_DEFAULT)
 895     {
 896       char *newlim;
 897       newlim = memchr (ptr, tab, lim - ptr);
 898       if (newlim)
 899         lim = newlim;
 900     }
 901   else
 902     {
 903       char *newlim;
 904       newlim = ptr;
 905       while (newlim < lim && blanks[UCHAR (*newlim)])
 906         ++newlim;
 907       while (newlim < lim && !blanks[UCHAR (*newlim)])
 908         ++newlim;
 909       lim = newlim;
 910     }
 911 #endif
 912
 913   /* If we're skipping leading blanks, don't start counting characters
 914      until after skipping past any leading blanks.  */
 915   if (key->skipsblanks)
 916     while (ptr < lim && blanks[UCHAR (*ptr)])
 917       ++ptr;
 918
 919   /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
 920   remaining_bytes = lim - ptr;
 921   if (echar < remaining_bytes)
 922     ptr += echar;
 923   else
 924     ptr = lim;
 925
 926   return ptr;
 927 }
 928
 929 /* Return the number of trailing blanks in FIELD, with LEN bytes.  */
 930
 931 static size_t
 932 trailing_blanks (char const *field, size_t len)
 933 {
 934   size_t i;
 935   for (i = len; 0 < i && blanks[UCHAR (field[i - 1])]; i--)
 936     continue;
 937   return len - i;
 938 }
 939
 940 /* Fill BUF reading from FP, moving buf->left bytes from the end
 941    of buf->buf to the beginning first.  If EOF is reached and the
 942    file wasn't terminated by a newline, supply one.  Set up BUF's line
 943    table too.  FILE is the name of the file corresponding to FP.
 944    Return true if some input was read.  */
 945
 946 static bool
 947 fillbuf (struct buffer *buf, register FILE *fp, char const *file)
 948 {
 949   struct keyfield const *key = keylist;
 950   char eol = eolchar;
 951   size_t line_bytes = buf->line_bytes;
 952   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
 953
 954   if (buf->eof)
 955     return false;
 956
 957   if (buf->used != buf->left)
 958     {
 959       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
 960       buf->used = buf->left;
 961       buf->nlines = 0;
 962     }
 963
 964   for (;;)
 965     {
 966       char *ptr = buf->buf + buf->used;
 967       struct line *linelim = buffer_linelim (buf);
 968       struct line *line = linelim - buf->nlines;
 969       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
 970       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
 971
 972       while (line_bytes + 1 < avail)
 973         {
 974           /* Read as many bytes as possible, but do not read so many
 975              bytes that there might not be enough room for the
 976              corresponding line array.  The worst case is when the
 977              rest of the input file consists entirely of newlines,
 978              except that the last byte is not a newline.  */
 979           size_t readsize = (avail - 1) / (line_bytes + 1);
 980           size_t bytes_read = fread (ptr, 1, readsize, fp);
 981           char *ptrlim = ptr + bytes_read;
 982           char *p;
 983           avail -= bytes_read;
 984
 985           if (bytes_read != readsize)
 986             {
 987               if (ferror (fp))
 988                 die (_("read failed"), file);
 989               if (feof (fp))
 990                 {
 991                   buf->eof = true;
 992                   if (buf->buf == ptrlim)
 993                     return false;
 994                   if (ptrlim[-1] != eol)
 995                     *ptrlim++ = eol;
 996                 }
 997             }
 998
 999           /* Find and record each line in the just-read input.  */
1000           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1001             {
1002               ptr = p + 1;
1003               line--;
1004               line->text = line_start;
1005               line->length = ptr - line_start;
1006               mergesize = MAX (mergesize, line->length);
1007               avail -= line_bytes;
1008
1009               if (key)
1010                 {
1011                   /* Precompute the position of the first key for
1012                      efficiency. */
1013                   line->keylim = (key->eword == SIZE_MAX
1014                                   ? p
1015                                   : limfield (line, key));
1016
1017                   if (key->sword != SIZE_MAX)
1018                     line->keybeg = begfield (line, key);
1019                   else
1020                     {
1021                       if (key->skipsblanks)
1022                         while (blanks[UCHAR (*line_start)])
1023                           line_start++;
1024                       line->keybeg = line_start;
1025                     }
1026                   if (key->skipeblanks)
1027                     {
1028                       size_t keylen = line->keylim - line->keybeg;
1029                       line->keylim -= trailing_blanks (line->keybeg, keylen);
1030                     }
1031                 }
1032
1033               line_start = ptr;
1034             }
1035
1036           ptr = ptrlim;
1037           if (buf->eof)
1038             break;
1039         }
1040
1041       buf->used = ptr - buf->buf;
1042       buf->nlines = buffer_linelim (buf) - line;
1043       if (buf->nlines != 0)
1044         {
1045           buf->left = ptr - line_start;
1046           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1047           return true;
1048         }
1049
1050       /* The current input line is too long to fit in the buffer.
1051          Double the buffer size and try again.  */
1052       buf->buf = x2nrealloc (buf->buf, &buf->alloc, sizeof *(buf->buf));
1053     }
1054 }
1055
1056 /* Compare strings A and B containing decimal fractions < 1.  Each string
1057    should begin with a decimal point followed immediately by the digits
1058    of the fraction.  Strings not of this form are considered to be zero. */
1059
1060 /* The goal here, is to take two numbers a and b... compare these
1061    in parallel.  Instead of converting each, and then comparing the
1062    outcome.  Most likely stopping the comparison before the conversion
1063    is complete.  The algorithm used, in the old sort:
1064
1065    Algorithm: fraccompare
1066    Action   : compare two decimal fractions
1067    accepts  : char *a, char *b
1068    returns  : -1 if a<b, 0 if a=b, 1 if a>b.
1069    implement:
1070
1071    if *a == decimal_point AND *b == decimal_point
1072      find first character different in a and b.
1073      if both are digits, return the difference *a - *b.
1074      if *a is a digit
1075        skip past zeros
1076        if digit return 1, else 0
1077      if *b is a digit
1078        skip past zeros
1079        if digit return -1, else 0
1080    if *a is a decimal_point
1081      skip past decimal_point and zeros
1082      if digit return 1, else 0
1083    if *b is a decimal_point
1084      skip past decimal_point and zeros
1085      if digit return -1, else 0
1086    return 0 */
1087
1088 static int
1089 fraccompare (register const char *a, register const char *b)
1090 {
1091   if (*a == decimal_point && *b == decimal_point)
1092     {
1093       while (*++a == *++b)
1094         if (! ISDIGIT (*a))
1095           return 0;
1096       if (ISDIGIT (*a) && ISDIGIT (*b))
1097         return *a - *b;
1098       if (ISDIGIT (*a))
1099         goto a_trailing_nonzero;
1100       if (ISDIGIT (*b))
1101         goto b_trailing_nonzero;
1102       return 0;
1103     }
1104   else if (*a++ == decimal_point)
1105     {
1106     a_trailing_nonzero:
1107       while (*a == NUMERIC_ZERO)
1108         a++;
1109       return ISDIGIT (*a);
1110     }
1111   else if (*b++ == decimal_point)
1112     {
1113     b_trailing_nonzero:
1114       while (*b == NUMERIC_ZERO)
1115         b++;
1116       return - ISDIGIT (*b);
1117     }
1118   return 0;
1119 }
1120
1121 /* Compare strings A and B as numbers without explicitly converting them to
1122    machine numbers.  Comparatively slow for short strings, but asymptotically
1123    hideously fast. */
1124
1125 static int
1126 numcompare (register const char *a, register const char *b)
1127 {
1128   register int tmpa, tmpb, tmp;
1129   register size_t log_a, log_b;
1130
1131   tmpa = *a;
1132   tmpb = *b;
1133
1134   while (blanks[UCHAR (tmpa)])
1135     tmpa = *++a;
1136   while (blanks[UCHAR (tmpb)])
1137     tmpb = *++b;
1138
1139   if (tmpa == NEGATION_SIGN)
1140     {
1141       do
1142         tmpa = *++a;
1143       while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa));
1144       if (tmpb != NEGATION_SIGN)
1145         {
1146           if (tmpa == decimal_point)
1147             do
1148               tmpa = *++a;
1149             while (tmpa == NUMERIC_ZERO);
1150           if (ISDIGIT (tmpa))
1151             return -1;
1152           while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb))
1153             tmpb = *++b;
1154           if (tmpb == decimal_point)
1155             do
1156               tmpb = *++b;
1157             while (tmpb == NUMERIC_ZERO);
1158           if (ISDIGIT (tmpb))
1159             return -1;
1160           return 0;
1161         }
1162       do
1163         tmpb = *++b;
1164       while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb));
1165
1166       while (tmpa == tmpb && ISDIGIT (tmpa))
1167         {
1168           do
1169             tmpa = *++a;
1170           while (IS_THOUSANDS_SEP (tmpa));
1171           do
1172             tmpb = *++b;
1173           while (IS_THOUSANDS_SEP (tmpb));
1174         }
1175
1176       if ((tmpa == decimal_point && !ISDIGIT (tmpb))
1177           || (tmpb == decimal_point && !ISDIGIT (tmpa)))
1178         return -fraccompare (a, b);
1179
1180       tmp = tmpb - tmpa;
1181
1182       for (log_a = 0; ISDIGIT (tmpa); ++log_a)
1183         do
1184           tmpa = *++a;
1185         while (IS_THOUSANDS_SEP (tmpa));
1186
1187       for (log_b = 0; ISDIGIT (tmpb); ++log_b)
1188         do
1189           tmpb = *++b;
1190         while (IS_THOUSANDS_SEP (tmpb));
1191
1192       if (log_a != log_b)
1193         return log_a < log_b ? 1 : -1;
1194
1195       if (!log_a)
1196         return 0;
1197
1198       return tmp;
1199     }
1200   else if (tmpb == NEGATION_SIGN)
1201     {
1202       do
1203         tmpb = *++b;
1204       while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb));
1205       if (tmpb == decimal_point)
1206         do
1207           tmpb = *++b;
1208         while (tmpb == NUMERIC_ZERO);
1209       if (ISDIGIT (tmpb))
1210         return 1;
1211       while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa))
1212         tmpa = *++a;
1213       if (tmpa == decimal_point)
1214         do
1215           tmpa = *++a;
1216         while (tmpa == NUMERIC_ZERO);
1217       if (ISDIGIT (tmpa))
1218         return 1;
1219       return 0;
1220     }
1221   else
1222     {
1223       while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa))
1224         tmpa = *++a;
1225       while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb))
1226         tmpb = *++b;
1227
1228       while (tmpa == tmpb && ISDIGIT (tmpa))
1229         {
1230           do
1231             tmpa = *++a;
1232           while (IS_THOUSANDS_SEP (tmpa));
1233           do
1234             tmpb = *++b;
1235           while (IS_THOUSANDS_SEP (tmpb));
1236         }
1237
1238       if ((tmpa == decimal_point && !ISDIGIT (tmpb))
1239           || (tmpb == decimal_point && !ISDIGIT (tmpa)))
1240         return fraccompare (a, b);
1241
1242       tmp = tmpa - tmpb;
1243
1244       for (log_a = 0; ISDIGIT (tmpa); ++log_a)
1245         do
1246           tmpa = *++a;
1247         while (IS_THOUSANDS_SEP (tmpa));
1248
1249       for (log_b = 0; ISDIGIT (tmpb); ++log_b)
1250         do
1251           tmpb = *++b;
1252         while (IS_THOUSANDS_SEP (tmpb));
1253
1254       if (log_a != log_b)
1255         return log_a < log_b ? -1 : 1;
1256
1257       if (!log_a)
1258         return 0;
1259
1260       return tmp;
1261     }
1262 }
1263
1264 static int
1265 general_numcompare (const char *sa, const char *sb)
1266 {
1267   /* FIXME: add option to warn about failed conversions.  */
1268   /* FIXME: maybe add option to try expensive FP conversion
1269      only if A and B can't be compared more cheaply/accurately.  */
1270
1271   char *ea;
1272   char *eb;
1273   double a = strtod (sa, &ea);
1274   double b = strtod (sb, &eb);
1275
1276   /* Put conversion errors at the start of the collating sequence.  */
1277   if (sa == ea)
1278     return sb == eb ? 0 : -1;
1279   if (sb == eb)
1280     return 1;
1281
1282   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1283      conversion errors but before numbers; sort them by internal
1284      bit-pattern, for lack of a more portable alternative.  */
1285   return (a < b ? -1
1286           : a > b ? 1
1287           : a == b ? 0
1288           : b == b ? -1
1289           : a == a ? 1
1290           : memcmp ((char *) &a, (char *) &b, sizeof a));
1291 }
1292
1293 /* Return an integer in 1..12 of the month name S with length LEN.
1294    Return 0 if the name in S is not recognized.  */
1295
1296 static int
1297 getmonth (const char *s, size_t len)
1298 {
1299   char *month;
1300   register size_t i;
1301   register int lo = 0, hi = MONTHS_PER_YEAR, result;
1302
1303   while (len > 0 && blanks[UCHAR (*s)])
1304     {
1305       ++s;
1306       --len;
1307     }
1308
1309   if (len == 0)
1310     return 0;
1311
1312   month = alloca (len + 1);
1313   for (i = 0; i < len; ++i)
1314     month[i] = fold_toupper[UCHAR (s[i])];
1315   len -= trailing_blanks (month, len);
1316   month[len] = '\0';
1317
1318   do
1319     {
1320       int ix = (lo + hi) / 2;
1321
1322       if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
1323         hi = ix;
1324       else
1325         lo = ix;
1326     }
1327   while (hi - lo > 1);
1328
1329   result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
1330             ? monthtab[lo].val : 0);
1331
1332   return result;
1333 }
1334
1335 /* Compare two lines A and B trying every key in sequence until there
1336    are no more keys or a difference is found. */
1337
1338 static int
1339 keycompare (const struct line *a, const struct line *b)
1340 {
1341   struct keyfield const *key = keylist;
1342
1343   /* For the first iteration only, the key positions have been
1344      precomputed for us. */
1345   register char *texta = a->keybeg;
1346   register char *textb = b->keybeg;
1347   register char *lima = a->keylim;
1348   register char *limb = b->keylim;
1349
1350   int diff;
1351
1352   for (;;)
1353     {
1354       register char const *translate = key->translate;
1355       register bool const *ignore = key->ignore;
1356
1357       /* Find the lengths. */
1358       size_t lena = lima <= texta ? 0 : lima - texta;
1359       size_t lenb = limb <= textb ? 0 : limb - textb;
1360
1361       if (key->skipeblanks)
1362         {
1363           lena -= trailing_blanks (texta, lena);
1364           lenb -= trailing_blanks (textb, lenb);
1365         }
1366
1367       /* Actually compare the fields. */
1368       if (key->numeric | key->general_numeric)
1369         {
1370           char savea = *lima, saveb = *limb;
1371
1372           *lima = *limb = '\0';
1373           diff = ((key->numeric ? numcompare : general_numcompare)
1374                   (texta, textb));
1375           *lima = savea, *limb = saveb;
1376         }
1377       else if (key->month)
1378         diff = getmonth (texta, lena) - getmonth (textb, lenb);
1379       /* Sorting like this may become slow, so in a simple locale the user
1380          can select a faster sort that is similar to ascii sort  */
1381       else if (HAVE_SETLOCALE && hard_LC_COLLATE)
1382         {
1383           if (ignore || translate)
1384             {
1385               char *copy_a = alloca (lena + 1 + lenb + 1);
1386               char *copy_b = copy_a + lena + 1;
1387               size_t new_len_a, new_len_b, i;
1388
1389               /* Ignore and/or translate chars before comparing.  */
1390               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1391                 {
1392                   if (i < lena)
1393                     {
1394                       copy_a[new_len_a] = (translate
1395                                            ? translate[UCHAR (texta[i])]
1396                                            : texta[i]);
1397                       if (!ignore || !ignore[UCHAR (texta[i])])
1398                         ++new_len_a;
1399                     }
1400                   if (i < lenb)
1401                     {
1402                       copy_b[new_len_b] = (translate
1403                                            ? translate[UCHAR (textb[i])]
1404                                            : textb [i]);
1405                       if (!ignore || !ignore[UCHAR (textb[i])])
1406                         ++new_len_b;
1407                     }
1408                 }
1409
1410               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1411             }
1412           else if (lena == 0)
1413             diff = - NONZERO (lenb);
1414           else if (lenb == 0)
1415             goto greater;
1416           else
1417             diff = xmemcoll (texta, lena, textb, lenb);
1418         }
1419       else if (ignore)
1420         {
1421 #define CMP_WITH_IGNORE(A, B)                                           \
1422   do                                                                    \
1423     {                                                                   \
1424           for (;;)                                                      \
1425             {                                                           \
1426               while (texta < lima && ignore[UCHAR (*texta)])            \
1427                 ++texta;                                                \
1428               while (textb < limb && ignore[UCHAR (*textb)])            \
1429                 ++textb;                                                \
1430               if (! (texta < lima && textb < limb))                     \
1431                 break;                                                  \
1432               diff = UCHAR (A) - UCHAR (B);                             \
1433               if (diff)                                                 \
1434                 goto not_equal;                                         \
1435               ++texta;                                                  \
1436               ++textb;                                                  \
1437             }                                                           \
1438                                                                         \
1439           diff = (texta < lima) - (textb < limb);                       \
1440     }                                                                   \
1441   while (0)
1442
1443           if (translate)
1444             CMP_WITH_IGNORE (translate[UCHAR (*texta)],
1445                              translate[UCHAR (*textb)]);
1446           else
1447             CMP_WITH_IGNORE (*texta, *textb);
1448         }
1449       else if (lena == 0)
1450         diff = - NONZERO (lenb);
1451       else if (lenb == 0)
1452         goto greater;
1453       else
1454         {
1455           if (translate)
1456             {
1457               while (texta < lima && textb < limb)
1458                 {
1459                   diff = (UCHAR (translate[UCHAR (*texta++)])
1460                           - UCHAR (translate[UCHAR (*textb++)]));
1461                   if (diff)
1462                     goto not_equal;
1463                 }
1464             }
1465           else
1466             {
1467               diff = memcmp (texta, textb, MIN (lena, lenb));
1468               if (diff)
1469                 goto not_equal;
1470             }
1471           diff = lena < lenb ? -1 : lena != lenb;
1472         }
1473
1474       if (diff)
1475         goto not_equal;
1476
1477       key = key->next;
1478       if (! key)
1479         break;
1480
1481       /* Find the beginning and limit of the next field.  */
1482       if (key->eword != SIZE_MAX)
1483         lima = limfield (a, key), limb = limfield (b, key);
1484       else
1485         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1486
1487       if (key->sword != SIZE_MAX)
1488         texta = begfield (a, key), textb = begfield (b, key);
1489       else
1490         {
1491           texta = a->text, textb = b->text;
1492           if (key->skipsblanks)
1493             {
1494               while (texta < lima && blanks[UCHAR (*texta)])
1495                 ++texta;
1496               while (textb < limb && blanks[UCHAR (*textb)])
1497                 ++textb;
1498             }
1499         }
1500     }
1501
1502   return 0;
1503
1504  greater:
1505   diff = 1;
1506  not_equal:
1507   return key->reverse ? -diff : diff;
1508 }
1509
1510 /* Compare two lines A and B, returning negative, zero, or positive
1511    depending on whether A compares less than, equal to, or greater than B. */
1512
1513 static int
1514 compare (register const struct line *a, register const struct line *b)
1515 {
1516   int diff;
1517   size_t alen, blen;
1518
1519   /* First try to compare on the specified keys (if any).
1520      The only two cases with no key at all are unadorned sort,
1521      and unadorned sort -r. */
1522   if (keylist)
1523     {
1524       diff = keycompare (a, b);
1525       alloca (0);
1526       if (diff | unique | stable)
1527         return diff;
1528     }
1529
1530   /* If the keys all compare equal (or no keys were specified)
1531      fall through to the default comparison.  */
1532   alen = a->length - 1, blen = b->length - 1;
1533
1534   if (alen == 0)
1535     diff = - NONZERO (blen);
1536   else if (blen == 0)
1537     diff = 1;
1538   else if (HAVE_SETLOCALE && hard_LC_COLLATE)
1539     diff = xmemcoll (a->text, alen, b->text, blen);
1540   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1541     diff = alen < blen ? -1 : alen != blen;
1542
1543   return reverse ? -diff : diff;
1544 }
1545
1546 /* Check that the lines read from FILE_NAME come in order.  Print a
1547    diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1548    false if they are not in order.  Otherwise, print no diagnostic
1549    and return true.  */
1550
1551 static bool
1552 check (char const *file_name)
1553 {
1554   FILE *fp = xfopen (file_name, "r");
1555   struct buffer buf;            /* Input buffer. */
1556   struct line temp;             /* Copy of previous line. */
1557   size_t alloc = 0;
1558   uintmax_t line_number = 0;
1559   struct keyfield const *key = keylist;
1560   bool nonunique = ! unique;
1561   bool ordered = true;
1562
1563   initbuf (&buf, sizeof (struct line),
1564            MAX (merge_buffer_size, sort_size));
1565   temp.text = NULL;
1566
1567   while (fillbuf (&buf, fp, file_name))
1568     {
1569       struct line const *line = buffer_linelim (&buf);
1570       struct line const *linebase = line - buf.nlines;
1571
1572       /* Make sure the line saved from the old buffer contents is
1573          less than or equal to the first line of the new buffer. */
1574       if (alloc && nonunique <= compare (&temp, line - 1))
1575         {
1576         found_disorder:
1577           {
1578             struct line const *disorder_line = line - 1;
1579             uintmax_t disorder_line_number =
1580               buffer_linelim (&buf) - disorder_line + line_number;
1581             char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1582             fprintf (stderr, _("%s: %s:%s: disorder: "),
1583                      program_name, file_name,
1584                      umaxtostr (disorder_line_number, hr_buf));
1585             write_bytes (disorder_line->text, disorder_line->length, stderr,
1586                          _("standard error"));
1587             ordered = false;
1588             break;
1589           }
1590         }
1591
1592       /* Compare each line in the buffer with its successor.  */
1593       while (linebase < --line)
1594         if (nonunique <= compare (line, line - 1))
1595           goto found_disorder;
1596
1597       line_number += buf.nlines;
1598
1599       /* Save the last line of the buffer.  */
1600       if (alloc < line->length)
1601         {
1602           do
1603             {
1604               alloc *= 2;
1605               if (! alloc)
1606                 {
1607                   alloc = line->length;
1608                   break;
1609                 }
1610             }
1611           while (alloc < line->length);
1612
1613           temp.text = xrealloc (temp.text, alloc);
1614         }
1615       memcpy (temp.text, line->text, line->length);
1616       temp.length = line->length;
1617       if (key)
1618         {
1619           temp.keybeg = temp.text + (line->keybeg - line->text);
1620           temp.keylim = temp.text + (line->keylim - line->text);
1621         }
1622     }
1623
1624   xfclose (fp, file_name);
1625   free (buf.buf);
1626   if (temp.text)
1627     free (temp.text);
1628   return ordered;
1629 }
1630
1631 /* Merge lines from FILES onto OFP.  NFILES cannot be greater than
1632    NMERGE.  Close input and output files before returning.
1633    OUTPUT_FILE gives the name of the output file; if OFP is NULL, the
1634    output file has not been opened yet.  */
1635
1636 static void
1637 mergefps (char **files, register int nfiles,
1638           FILE *ofp, const char *output_file)
1639 {
1640   FILE *fps[NMERGE];            /* Input streams for each file.  */
1641   struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1642   struct line saved;            /* Saved line storage for unique check. */
1643   struct line const *savedline = NULL;
1644                                 /* &saved if there is a saved line. */
1645   size_t savealloc = 0;         /* Size allocated for the saved line. */
1646   struct line const *cur[NMERGE]; /* Current line in each line table. */
1647   struct line const *base[NMERGE]; /* Base of each line table.  */
1648   int ord[NMERGE];              /* Table representing a permutation of fps,
1649                                    such that cur[ord[0]] is the smallest line
1650                                    and will be next output. */
1651   register int i, j, t;
1652   struct keyfield const *key = keylist;
1653   saved.text = NULL;
1654
1655   /* Read initial lines from each input file. */
1656   for (i = 0; i < nfiles; )
1657     {
1658       fps[i] = xfopen (files[i], "r");
1659       initbuf (&buffer[i], sizeof (struct line),
1660                MAX (merge_buffer_size, sort_size / nfiles));
1661       if (fillbuf (&buffer[i], fps[i], files[i]))
1662         {
1663           struct line const *linelim = buffer_linelim (&buffer[i]);
1664           cur[i] = linelim - 1;
1665           base[i] = linelim - buffer[i].nlines;
1666           i++;
1667         }
1668       else
1669         {
1670           /* fps[i] is empty; eliminate it from future consideration.  */
1671           xfclose (fps[i], files[i]);
1672           zaptemp (files[i]);
1673           free (buffer[i].buf);
1674           --nfiles;
1675           for (j = i; j < nfiles; ++j)
1676             files[j] = files[j + 1];
1677         }
1678     }
1679
1680   if (! ofp)
1681     ofp = xfopen (output_file, "w");
1682
1683   /* Set up the ord table according to comparisons among input lines.
1684      Since this only reorders two items if one is strictly greater than
1685      the other, it is stable. */
1686   for (i = 0; i < nfiles; ++i)
1687     ord[i] = i;
1688   for (i = 1; i < nfiles; ++i)
1689     if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
1690       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1691
1692   /* Repeatedly output the smallest line until no input remains. */
1693   while (nfiles)
1694     {
1695       struct line const *smallest = cur[ord[0]];
1696
1697       /* If uniquified output is turned on, output only the first of
1698          an identical series of lines. */
1699       if (unique)
1700         {
1701           if (savedline && compare (savedline, smallest))
1702             {
1703               savedline = 0;
1704               write_bytes (saved.text, saved.length, ofp, output_file);
1705             }
1706           if (!savedline)
1707             {
1708               savedline = &saved;
1709               if (savealloc < smallest->length)
1710                 {
1711                   do
1712                     if (! savealloc)
1713                       {
1714                         savealloc = smallest->length;
1715                         break;
1716                       }
1717                   while ((savealloc *= 2) < smallest->length);
1718
1719                   saved.text = xrealloc (saved.text, savealloc);
1720                 }
1721               saved.length = smallest->length;
1722               memcpy (saved.text, smallest->text, saved.length);
1723               if (key)
1724                 {
1725                   saved.keybeg =
1726                     saved.text + (smallest->keybeg - smallest->text);
1727                   saved.keylim =
1728                     saved.text + (smallest->keylim - smallest->text);
1729                 }
1730             }
1731         }
1732       else
1733         write_bytes (smallest->text, smallest->length, ofp, output_file);
1734
1735       /* Check if we need to read more lines into core. */
1736       if (base[ord[0]] < smallest)
1737         cur[ord[0]] = smallest - 1;
1738       else
1739         {
1740           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]]))
1741             {
1742               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
1743               cur[ord[0]] = linelim - 1;
1744               base[ord[0]] = linelim - buffer[ord[0]].nlines;
1745             }
1746           else
1747             {
1748               /* We reached EOF on fps[ord[0]]. */
1749               for (i = 1; i < nfiles; ++i)
1750                 if (ord[i] > ord[0])
1751                   --ord[i];
1752               --nfiles;
1753               xfclose (fps[ord[0]], files[ord[0]]);
1754               zaptemp (files[ord[0]]);
1755               free (buffer[ord[0]].buf);
1756               for (i = ord[0]; i < nfiles; ++i)
1757                 {
1758                   fps[i] = fps[i + 1];
1759                   files[i] = files[i + 1];
1760                   buffer[i] = buffer[i + 1];
1761                   cur[i] = cur[i + 1];
1762                   base[i] = base[i + 1];
1763                 }
1764               for (i = 0; i < nfiles; ++i)
1765                 ord[i] = ord[i + 1];
1766               continue;
1767             }
1768         }
1769
1770       /* The new line just read in may be larger than other lines
1771          already in core; push it back in the queue until we encounter
1772          a line larger than it. */
1773       for (i = 1; i < nfiles; ++i)
1774         {
1775           t = compare (cur[ord[0]], cur[ord[i]]);
1776           if (!t)
1777             t = ord[0] - ord[i];
1778           if (t < 0)
1779             break;
1780         }
1781       t = ord[0];
1782       for (j = 1; j < i; ++j)
1783         ord[j - 1] = ord[j];
1784       ord[i - 1] = t;
1785     }
1786
1787   if (unique && savedline)
1788     {
1789       write_bytes (saved.text, saved.length, ofp, output_file);
1790       free (saved.text);
1791     }
1792
1793   xfclose (ofp, output_file);
1794 }
1795
1796 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1797    and HI (with NHI members).  T, LO, and HI point just past their
1798    respective arrays, and the arrays are in reverse order.  NLO and
1799    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
1800
1801 static inline void
1802 mergelines (struct line *t,
1803             struct line const *lo, size_t nlo,
1804             struct line const *hi, size_t nhi)
1805 {
1806   for (;;)
1807     if (compare (lo - 1, hi - 1) <= 0)
1808       {
1809         *--t = *--lo;
1810         if (! --nlo)
1811           {
1812             /* HI - NHI equalled T - (NLO + NHI) when this function
1813                began.  Therefore HI must equal T now, and there is no
1814                need to copy from HI to T.  */
1815             return;
1816           }
1817       }
1818     else
1819       {
1820         *--t = *--hi;
1821         if (! --nhi)
1822           {
1823             do
1824               *--t = *--lo;
1825             while (--nlo);
1826
1827             return;
1828           }
1829       }
1830 }
1831
1832 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1833    NLINES must be at least 2.
1834    The input and output arrays are in reverse order, and LINES and
1835    TEMP point just past the end of their respective arrays.
1836
1837    Use a recursive divide-and-conquer algorithm, in the style
1838    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
1839    the optimization suggested by exercise 5.2.4-10; this requires room
1840    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
1841    writes that this memory optimization was originally published by
1842    D. A. Bell, Comp J. 1 (1958), 75.  */
1843
1844 static void
1845 sortlines (struct line *lines, size_t nlines, struct line *temp)
1846 {
1847   if (nlines == 2)
1848     {
1849       if (0 < compare (&lines[-1], &lines[-2]))
1850         {
1851           struct line tmp = lines[-1];
1852           lines[-1] = lines[-2];
1853           lines[-2] = tmp;
1854         }
1855     }
1856   else
1857     {
1858       size_t nlo = nlines / 2;
1859       size_t nhi = nlines - nlo;
1860       struct line *lo = lines;
1861       struct line *hi = lines - nlo;
1862       struct line *sorted_lo = temp;
1863
1864       sortlines (hi, nhi, temp);
1865       if (1 < nlo)
1866         sortlines_temp (lo, nlo, sorted_lo);
1867       else
1868         sorted_lo[-1] = lo[-1];
1869
1870       mergelines (lines, sorted_lo, nlo, hi, nhi);
1871     }
1872 }
1873
1874 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1875    rather than sorting in place.  */
1876
1877 static void
1878 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
1879 {
1880   if (nlines == 2)
1881     {
1882       bool swap = (0 < compare (&lines[-1], &lines[-2]));
1883       temp[-1] = lines[-1 - swap];
1884       temp[-2] = lines[-2 + swap];
1885     }
1886   else
1887     {
1888       size_t nlo = nlines / 2;
1889       size_t nhi = nlines - nlo;
1890       struct line *lo = lines;
1891       struct line *hi = lines - nlo;
1892       struct line *sorted_hi = temp - nlo;
1893
1894       sortlines_temp (hi, nhi, sorted_hi);
1895       if (1 < nlo)
1896         sortlines (lo, nlo, temp);
1897
1898       mergelines (temp, lo, nlo, sorted_hi, nhi);
1899     }
1900 }
1901
1902 /* Return the index of the first of NFILES FILES that is the same file
1903    as OUTFILE.  If none can be the same, return NFILES.  Consider an
1904    input pipe to be the same as OUTFILE, since the pipe might be the
1905    output of a command like "cat OUTFILE".  */
1906
1907 static int
1908 first_same_file (char * const *files, int nfiles, char const *outfile)
1909 {
1910   int i;
1911   bool got_outstat = false;
1912   struct stat instat, outstat;
1913
1914   for (i = 0; i < nfiles; i++)
1915     {
1916       bool standard_input = STREQ (files[i], "-");
1917
1918       if (STREQ (outfile, files[i]) && ! standard_input)
1919         return i;
1920
1921       if (! got_outstat)
1922         {
1923           got_outstat = true;
1924           if ((STREQ (outfile, "-")
1925                ? fstat (STDOUT_FILENO, &outstat)
1926                : stat (outfile, &outstat))
1927               != 0)
1928             return nfiles;
1929         }
1930
1931       if (((standard_input
1932             ? fstat (STDIN_FILENO, &instat)
1933             : stat (files[i], &instat))
1934            == 0)
1935           && (S_ISFIFO (instat.st_mode) || SAME_INODE (instat, outstat)))
1936         return i;
1937     }
1938
1939   return nfiles;
1940 }
1941
1942 /* Merge NFILES FILES onto OUTPUT_FILE.  However, merge at most
1943    MAX_MERGE input files directly onto OUTPUT_FILE.  MAX_MERGE cannot
1944    exceed NMERGE.  */
1945
1946 static void
1947 merge (char **files, int nfiles, int max_merge, char const *output_file)
1948 {
1949   while (max_merge < nfiles)
1950     {
1951       FILE *tfp;
1952       int i, t = 0;
1953       char *temp;
1954       for (i = 0; i < nfiles / NMERGE; ++i)
1955         {
1956           temp = create_temp_file (&tfp);
1957           mergefps (&files[i * NMERGE], NMERGE, tfp, temp);
1958           files[t++] = temp;
1959         }
1960       temp = create_temp_file (&tfp);
1961       mergefps (&files[i * NMERGE], nfiles % NMERGE, tfp, temp);
1962       files[t++] = temp;
1963       nfiles = t;
1964       if (nfiles == 1)
1965         break;
1966     }
1967
1968   mergefps (files, nfiles, NULL, output_file);
1969 }
1970
1971 /* Sort NFILES FILES onto OUTPUT_FILE. */
1972
1973 static void
1974 sort (char * const *files, int nfiles, char const *output_file)
1975 {
1976   struct buffer buf;
1977   int n_temp_files = 0;
1978   bool output_file_created = false;
1979
1980   buf.alloc = 0;
1981
1982   while (nfiles)
1983     {
1984       char const *temp_output;
1985       char const *file = *files;
1986       FILE *fp = xfopen (file, "r");
1987       FILE *tfp;
1988       size_t bytes_per_line = (2 * sizeof (struct line)
1989                                - sizeof (struct line) / 2);
1990
1991       if (! buf.alloc)
1992         initbuf (&buf, bytes_per_line,
1993                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
1994       buf.eof = false;
1995       files++;
1996       nfiles--;
1997
1998       while (fillbuf (&buf, fp, file))
1999         {
2000           struct line *line;
2001           struct line *linebase;
2002
2003           if (buf.eof && nfiles
2004               && (bytes_per_line + 1
2005                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2006             {
2007               /* End of file, but there is more input and buffer room.
2008                  Concatenate the next input file; this is faster in
2009                  the usual case.  */
2010               buf.left = buf.used;
2011               break;
2012             }
2013
2014           line = buffer_linelim (&buf);
2015           linebase = line - buf.nlines;
2016           if (1 < buf.nlines)
2017             sortlines (line, buf.nlines, linebase);
2018           if (buf.eof && !nfiles && !n_temp_files && !buf.left)
2019             {
2020               xfclose (fp, file);
2021               tfp = xfopen (output_file, "w");
2022               temp_output = output_file;
2023               output_file_created = true;
2024             }
2025           else
2026             {
2027               ++n_temp_files;
2028               temp_output = create_temp_file (&tfp);
2029             }
2030
2031           do
2032             {
2033               line--;
2034               write_bytes (line->text, line->length, tfp, temp_output);
2035               if (unique)
2036                 while (linebase < line && compare (line, line - 1) == 0)
2037                   line--;
2038             }
2039           while (linebase < line);
2040
2041           xfclose (tfp, temp_output);
2042
2043           if (output_file_created)
2044             goto finish;
2045         }
2046       xfclose (fp, file);
2047     }
2048
2049  finish:
2050   free (buf.buf);
2051
2052   if (! output_file_created)
2053     {
2054       int i = n_temp_files;
2055       struct tempnode *node;
2056       char **tempfiles = xnmalloc (n_temp_files, sizeof *tempfiles);
2057       for (node = temphead; i > 0; node = node->next)
2058         tempfiles[--i] = node->name;
2059       merge (tempfiles, n_temp_files, NMERGE, output_file);
2060       free (tempfiles);
2061     }
2062 }
2063
2064 /* Insert key KEY at the end of the key list.  */
2065
2066 static void
2067 insertkey (struct keyfield *key)
2068 {
2069   struct keyfield **p;
2070
2071   for (p = &keylist; *p; p = &(*p)->next)
2072     continue;
2073   *p = key;
2074   key->next = NULL;
2075 }
2076
2077 /* Report a bad field specification SPEC, with extra info MSGID.  */
2078
2079 static void badfieldspec (char const *, char const *)
2080      ATTRIBUTE_NORETURN;
2081 static void
2082 badfieldspec (char const *spec, char const *msgid)
2083 {
2084   error (SORT_FAILURE, 0, _("%s: invalid field specification `%s'"),
2085          _(msgid), spec);
2086   abort ();
2087 }
2088
2089 /* Parse the leading integer in STRING and store the resulting value
2090    (which must fit into size_t) into *VAL.  Return the address of the
2091    suffix after the integer.  If MSGID is NULL, return NULL after
2092    failure; otherwise, report MSGID and exit on failure.  */
2093
2094 static char const *
2095 parse_field_count (char const *string, size_t *val, char const *msgid)
2096 {
2097   char *suffix;
2098   uintmax_t n;
2099
2100   switch (xstrtoumax (string, &suffix, 10, &n, ""))
2101     {
2102     case LONGINT_OK:
2103     case LONGINT_INVALID_SUFFIX_CHAR:
2104       *val = n;
2105       if (*val == n)
2106         break;
2107       /* Fall through.  */
2108     case LONGINT_OVERFLOW:
2109     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2110       if (msgid)
2111         error (SORT_FAILURE, 0, _("%s: count `%.*s' too large"),
2112                _(msgid), (int) (suffix - string), string);
2113       return NULL;
2114
2115     case LONGINT_INVALID:
2116       if (msgid)
2117         error (SORT_FAILURE, 0, _("%s: invalid count at start of `%s'"),
2118                _(msgid), string);
2119       return NULL;
2120     }
2121
2122   return suffix;
2123 }
2124
2125 /* Handle interrupts and hangups. */
2126
2127 static void
2128 sighandler (int sig)
2129 {
2130 #ifndef SA_NOCLDSTOP
2131   signal (sig, SIG_IGN);
2132 #endif
2133
2134   cleanup ();
2135
2136 #ifdef SA_NOCLDSTOP
2137   {
2138     struct sigaction sigact;
2139
2140     sigact.sa_handler = SIG_DFL;
2141     sigemptyset (&sigact.sa_mask);
2142     sigact.sa_flags = 0;
2143     sigaction (sig, &sigact, NULL);
2144   }
2145 #else
2146   signal (sig, SIG_DFL);
2147 #endif
2148
2149   raise (sig);
2150 }
2151
2152 /* Set the ordering options for KEY specified in S.
2153    Return the address of the first character in S that
2154    is not a valid ordering option.
2155    BLANKTYPE is the kind of blanks that 'b' should skip. */
2156
2157 static char *
2158 set_ordering (register const char *s, struct keyfield *key,
2159               enum blanktype blanktype)
2160 {
2161   while (*s)
2162     {
2163       switch (*s)
2164         {
2165         case 'b':
2166           if (blanktype == bl_start || blanktype == bl_both)
2167             key->skipsblanks = true;
2168           if (blanktype == bl_end || blanktype == bl_both)
2169             key->skipeblanks = true;
2170           break;
2171         case 'd':
2172           key->ignore = nondictionary;
2173           break;
2174         case 'f':
2175           key->translate = fold_toupper;
2176           break;
2177         case 'g':
2178           key->general_numeric = true;
2179           break;
2180         case 'i':
2181           /* Option order should not matter, so don't let -i override
2182              -d.  -d implies -i, but -i does not imply -d.  */
2183           if (! key->ignore)
2184             key->ignore = nonprinting;
2185           break;
2186         case 'M':
2187           key->month = true;
2188           break;
2189         case 'n':
2190           key->numeric = true;
2191           break;
2192         case 'r':
2193           key->reverse = true;
2194           break;
2195         default:
2196           return (char *) s;
2197         }
2198       ++s;
2199     }
2200   return (char *) s;
2201 }
2202
2203 static struct keyfield *
2204 new_key (void)
2205 {
2206   struct keyfield *key = xzalloc (sizeof *key);
2207   key->eword = SIZE_MAX;
2208   return key;
2209 }
2210
2211 int
2212 main (int argc, char **argv)
2213 {
2214   struct keyfield *key;
2215   struct keyfield gkey;
2216   char const *s;
2217   int c = 0;
2218   bool checkonly = false;
2219   bool mergeonly = false;
2220   int nfiles = 0;
2221   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2222   bool obsolete_usage = (posix2_version () < 200112);
2223   char const *short_options = (obsolete_usage
2224                                ? COMMON_SHORT_OPTIONS "y::"
2225                                : COMMON_SHORT_OPTIONS "y:");
2226   char *minus = "-", **files;
2227   char const *outfile = minus;
2228   static int const sigs[] = { SIGHUP, SIGINT, SIGPIPE, SIGTERM };
2229   unsigned int nsigs = sizeof sigs / sizeof *sigs;
2230 #ifdef SA_NOCLDSTOP
2231   struct sigaction oldact, newact;
2232 #endif
2233
2234   initialize_main (&argc, &argv);
2235   program_name = argv[0];
2236   setlocale (LC_ALL, "");
2237   bindtextdomain (PACKAGE, LOCALEDIR);
2238   textdomain (PACKAGE);
2239
2240   atexit (cleanup);
2241
2242   initialize_exit_failure (SORT_FAILURE);
2243   atexit (close_stdout);
2244
2245   hard_LC_COLLATE = hard_locale (LC_COLLATE);
2246 #if HAVE_NL_LANGINFO
2247   hard_LC_TIME = hard_locale (LC_TIME);
2248 #endif
2249
2250 #if HAVE_SETLOCALE
2251   /* Let's get locale's representation of the decimal point */
2252   {
2253     struct lconv const *lconvp = localeconv ();
2254
2255     /* If the locale doesn't define a decimal point, or if the decimal
2256        point is multibyte, use the C decimal point.  We don't support
2257        multibyte decimal points yet.  */
2258     decimal_point = *lconvp->decimal_point;
2259     if (! decimal_point || lconvp->decimal_point[1])
2260       decimal_point = C_DECIMAL_POINT;
2261
2262     /* We don't support multibyte thousands separators yet.  */
2263     th_sep = *lconvp->thousands_sep;
2264     if (! th_sep || lconvp->thousands_sep[1])
2265       th_sep = CHAR_MAX + 1;
2266   }
2267 #endif
2268
2269   have_read_stdin = false;
2270   inittables ();
2271
2272 #ifdef SA_NOCLDSTOP
2273   {
2274     unsigned int i;
2275     sigemptyset (&caught_signals);
2276     for (i = 0; i < nsigs; i++)
2277       sigaddset (&caught_signals, sigs[i]);
2278     newact.sa_handler = sighandler;
2279     newact.sa_mask = caught_signals;
2280     newact.sa_flags = 0;
2281   }
2282 #endif
2283
2284   {
2285     unsigned int i;
2286     for (i = 0; i < nsigs; i++)
2287       {
2288         int sig = sigs[i];
2289 #ifdef SA_NOCLDSTOP
2290         sigaction (sig, NULL, &oldact);
2291         if (oldact.sa_handler != SIG_IGN)
2292           sigaction (sig, &newact, NULL);
2293 #else
2294         if (signal (sig, SIG_IGN) != SIG_IGN)
2295           signal (sig, sighandler);
2296 #endif
2297       }
2298   }
2299
2300   gkey.sword = gkey.eword = SIZE_MAX;
2301   gkey.ignore = NULL;
2302   gkey.translate = NULL;
2303   gkey.numeric = gkey.general_numeric = gkey.month = gkey.reverse = false;
2304   gkey.skipsblanks = gkey.skipeblanks = false;
2305
2306   files = xnmalloc (argc, sizeof *files);
2307
2308   for (;;)
2309     {
2310       /* Parse an operand as a file after "--" was seen; or if
2311          pedantic and a file was seen, unless the POSIX version
2312          predates 1003.1-2001 and -c was not seen and the operand is
2313          "-o FILE" or "-oFILE".  */
2314
2315       if (c == -1
2316           || (posixly_correct && nfiles != 0
2317               && ! (obsolete_usage
2318                     && ! checkonly
2319                     && optind != argc
2320                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
2321                     && (argv[optind][2] || optind + 1 != argc)))
2322           || ((c = getopt_long (argc, argv, short_options,
2323                                 long_options, NULL))
2324               == -1))
2325         {
2326           if (argc <= optind)
2327             break;
2328           files[nfiles++] = argv[optind++];
2329         }
2330       else switch (c)
2331         {
2332         case 1:
2333           key = NULL;
2334           if (obsolete_usage && optarg[0] == '+')
2335             {
2336               /* Treat +POS1 [-POS2] as a key if possible; but silently
2337                  treat an operand as a file if it is not a valid +POS1.  */
2338               key = new_key ();
2339               s = parse_field_count (optarg + 1, &key->sword, NULL);
2340               if (s && *s == '.')
2341                 s = parse_field_count (s + 1, &key->schar, NULL);
2342               if (! (key->sword | key->schar))
2343                 key->sword = SIZE_MAX;
2344               if (! s || *set_ordering (s, key, bl_start))
2345                 {
2346                   free (key);
2347                   key = NULL;
2348                 }
2349               else
2350                 {
2351                   if (optind != argc && argv[optind][0] == '-'
2352                       && ISDIGIT (argv[optind][1]))
2353                     {
2354                       char const *optarg1 = argv[optind++];
2355                       s = parse_field_count (optarg1 + 1, &key->eword,
2356                                              N_("invalid number after `-'"));
2357                       if (*s == '.')
2358                         s = parse_field_count (s + 1, &key->echar,
2359                                                N_("invalid number after `.'"));
2360                       if (*set_ordering (s, key, bl_end))
2361                         badfieldspec (optarg1,
2362                                       N_("stray character in field spec"));
2363                     }
2364                   insertkey (key);
2365                 }
2366             }
2367           if (! key)
2368             files[nfiles++] = optarg;
2369           break;
2370
2371         case 'b':
2372         case 'd':
2373         case 'f':
2374         case 'g':
2375         case 'i':
2376         case 'M':
2377         case 'n':
2378         case 'r':
2379           {
2380             char str[2];
2381             str[0] = c;
2382             str[1] = '\0';
2383             set_ordering (str, &gkey, bl_both);
2384           }
2385           break;
2386
2387         case 'c':
2388           checkonly = true;
2389           break;
2390
2391         case 'k':
2392           key = new_key ();
2393
2394           /* Get POS1. */
2395           s = parse_field_count (optarg, &key->sword,
2396                                  N_("invalid number at field start"));
2397           if (! key->sword--)
2398             {
2399               /* Provoke with `sort -k0' */
2400               badfieldspec (optarg, N_("field number is zero"));
2401             }
2402           if (*s == '.')
2403             {
2404               s = parse_field_count (s + 1, &key->schar,
2405                                      N_("invalid number after `.'"));
2406               if (! key->schar--)
2407                 {
2408                   /* Provoke with `sort -k1.0' */
2409                   badfieldspec (optarg, N_("character offset is zero"));
2410                 }
2411             }
2412           if (! (key->sword | key->schar))
2413             key->sword = SIZE_MAX;
2414           s = set_ordering (s, key, bl_start);
2415           if (*s != ',')
2416             {
2417               key->eword = SIZE_MAX;
2418               key->echar = 0;
2419             }
2420           else
2421             {
2422               /* Get POS2. */
2423               s = parse_field_count (s + 1, &key->eword,
2424                                      N_("invalid number after `,'"));
2425               if (! key->eword--)
2426                 {
2427                   /* Provoke with `sort -k1,0' */
2428                   badfieldspec (optarg, N_("field number is zero"));
2429                 }
2430               if (*s == '.')
2431                 s = parse_field_count (s + 1, &key->echar,
2432                                        N_("invalid number after `.'"));
2433               else
2434                 {
2435                   /* `-k 2,3' is equivalent to `+1 -3'.  */
2436                   key->eword++;
2437                 }
2438               s = set_ordering (s, key, bl_end);
2439             }
2440           if (*s)
2441             badfieldspec (optarg, N_("stray character in field spec"));
2442           insertkey (key);
2443           break;
2444
2445         case 'm':
2446           mergeonly = true;
2447           break;
2448
2449         case 'o':
2450           if (outfile != minus && strcmp (outfile, optarg) != 0)
2451             error (SORT_FAILURE, 0, _("multiple output files specified"));
2452           outfile = optarg;
2453           break;
2454
2455         case 's':
2456           stable = true;
2457           break;
2458
2459         case 'S':
2460           specify_sort_size (optarg);
2461           break;
2462
2463         case 't':
2464           {
2465             int newtab = optarg[0];
2466             if (! newtab)
2467               error (SORT_FAILURE, 0, _("empty tab"));
2468             if (optarg[1])
2469               {
2470                 if (strcmp (optarg, "\\0") == 0)
2471                   newtab = '\0';
2472                 else
2473                   {
2474                     /* Provoke with `sort -txx'.  Complain about
2475                        "multi-character tab" instead of "multibyte tab", so
2476                        that the diagnostic's wording does not need to be
2477                        changed once multibyte characters are supported.  */
2478                     error (SORT_FAILURE, 0, _("multi-character tab `%s'"),
2479                            optarg);
2480                   }
2481               }
2482             if (tab != TAB_DEFAULT && tab != newtab)
2483               error (SORT_FAILURE, 0, _("incompatible tabs"));
2484             tab = newtab;
2485           }
2486           break;
2487
2488         case 'T':
2489           add_temp_dir (optarg);
2490           break;
2491
2492         case 'u':
2493           unique = true;
2494           break;
2495
2496         case 'y':
2497           /* Accept and ignore e.g. -y0 for compatibility with Solaris
2498              2.x through Solaris 7.  -y is marked as obsolete starting
2499              with Solaris 8.  */
2500           break;
2501
2502         case 'z':
2503           eolchar = 0;
2504           break;
2505
2506         case_GETOPT_HELP_CHAR;
2507
2508         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2509
2510         default:
2511           usage (SORT_FAILURE);
2512         }
2513     }
2514
2515   /* Inheritance of global options to individual keys. */
2516   for (key = keylist; key; key = key->next)
2517     if (! (key->ignore || key->translate
2518            || (key->skipsblanks | key->reverse
2519                | key->skipeblanks | key->month | key->numeric
2520                | key->general_numeric)))
2521       {
2522         key->ignore = gkey.ignore;
2523         key->translate = gkey.translate;
2524         key->skipsblanks = gkey.skipsblanks;
2525         key->skipeblanks = gkey.skipeblanks;
2526         key->month = gkey.month;
2527         key->numeric = gkey.numeric;
2528         key->general_numeric = gkey.general_numeric;
2529         key->reverse = gkey.reverse;
2530       }
2531
2532   if (!keylist && (gkey.ignore || gkey.translate
2533                    || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
2534                        | gkey.numeric | gkey.general_numeric)))
2535     insertkey (&gkey);
2536   reverse = gkey.reverse;
2537
2538   if (temp_dir_count == 0)
2539     {
2540       char const *tmp_dir = getenv ("TMPDIR");
2541       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
2542     }
2543
2544   if (nfiles == 0)
2545     {
2546       nfiles = 1;
2547       files = &minus;
2548     }
2549
2550   if (checkonly)
2551     {
2552       if (nfiles > 1)
2553         error (SORT_FAILURE, 0, _("extra operand `%s' not allowed with -c"),
2554                files[1]);
2555
2556       /* POSIX requires that sort return 1 IFF invoked with -c and the
2557          input is not properly sorted.  */
2558       exit (check (files[0]) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
2559     }
2560
2561   if (mergeonly)
2562     {
2563       int max_merge = first_same_file (files, MIN (nfiles, NMERGE), outfile);
2564       merge (files, nfiles, max_merge, outfile);
2565     }
2566   else
2567     sort (files, nfiles, outfile);
2568
2569   if (have_read_stdin && fclose (stdin) == EOF)
2570     die (_("close failed"), "-");
2571
2572   exit (EXIT_SUCCESS);
2573 }