src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 1986, 1991, 1995-2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Richard M. Stallman and David MacKenzie. */
  18 \f
  19 #include <config.h>
  20
  21 #include <getopt.h>
  22 #include <sys/types.h>
  23
  24 #include "system.h"
  25 #include "argmatch.h"
  26 #include "linebuffer.h"
  27 #include "error.h"
  28 #include "fadvise.h"
  29 #include "hard-locale.h"
  30 #include "posixver.h"
  31 #include "quote.h"
  32 #include "stdio--.h"
  33 #include "xmemcoll.h"
  34 #include "xstrtol.h"
  35 #include "memcasecmp.h"
  36
/* The official name of this program (e.g., no `g' prefix).
   Passed to case_GETOPT_VERSION_CHAR in main.  */
#define PROGRAM_NAME "uniq"

/* Author list, expanded as the trailing arguments of
   case_GETOPT_VERSION_CHAR in main.  */
#define AUTHORS \
  proper_name ("Richard M. Stallman"), \
  proper_name ("David MacKenzie")
  43
  44 #define SWAP_LINES(A, B)                        \
  45   do                                            \
  46     {                                           \
  47       struct linebuffer *_tmp;                  \
  48       _tmp = (A);                               \
  49       (A) = (B);                                \
  50       (B) = _tmp;                               \
  51     }                                           \
  52   while (0)
  53
/* True if the LC_COLLATE locale is hard.  Set once in main from
   hard_locale (LC_COLLATE); when true (and case is not being ignored),
   different () compares with xmemcoll instead of memcmp.  */
static bool hard_LC_COLLATE;

/* Number of fields to skip on each line when doing comparisons.
   Set by -f N or by the obsolete -N digit options; defaults to 0.  */
static size_t skip_fields;

/* Number of chars to skip after skipping any fields.
   Set by -s N or by the obsolete +N operand; defaults to 0.
   find_field () advances one byte of the line buffer per count.  */
static size_t skip_chars;

/* Number of chars to compare.  Set by -w N; main initializes it to
   SIZE_MAX, which different () treats as "no limit" because it only
   shortens lengths that exceed this value.  */
static size_t check_chars;
  65
/* Whether occurrence counts are printed.  Note that count_occurrences
   is the first enumerator and therefore has the value 0: it must always
   be compared against the `countmode' variable, never tested for truth
   on its own.  */
enum countmode
{
  count_occurrences,            /* -c Print count before output lines. */
  count_none                    /* Default.  Do not print counts. */
};

/* Whether and how to precede the output lines with a count of the number of
   times they occurred in the input.  main sets it to count_none before
   option parsing and to count_occurrences when -c is given. */
static enum countmode countmode;
  75
/* Which lines to output: unique lines, the first of a group of
   repeated lines, and the second and subsequent lines of a group of
   repeated lines.  main starts with the first two true and the third
   false; -d and -D clear output_unique, -u clears
   output_first_repeated, and -D sets output_later_repeated.
   writeline () consults these to decide whether to print a line.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;

/* If true, ignore case when comparing.  Set by -i; as a static object
   it is false unless that option is given.  */
static bool ignore_case;
  85
/* How groups of repeated lines are delimited in the output when
   --all-repeated (-D) is in effect.  */
enum delimit_method
{
  /* No delimiters output.  --all-repeated[=none] */
  DM_NONE,

  /* Delimiter precedes all groups.  --all-repeated=prepend */
  DM_PREPEND,

  /* Delimit all groups.  --all-repeated=separate */
  DM_SEPARATE
};

/* Accepted spellings of the --all-repeated argument, NULL-terminated.
   Entry I corresponds to delimit_method_map[I]; main passes both
   arrays to XARGMATCH, so they must be kept in the same order.  */
static char const *const delimit_method_string[] =
{
  "none", "prepend", "separate", NULL
};

/* Values corresponding, index for index, to delimit_method_string.  */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE, DM_PREPEND, DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.
   main initializes this to DM_NONE; only -D/--all-repeated changes it.  */
static enum delimit_method delimit_groups;
 110
/* Long options accepted by the getopt_long call in main.  Each entry
   maps a long name to the short option character that main's switch
   handles.  --all-repeated is the only option whose argument is
   optional.  The array is terminated by an all-zero entry.  */
static struct option const longopts[] =
{
  {"count", no_argument, NULL, 'c'},
  {"repeated", no_argument, NULL, 'd'},
  {"all-repeated", optional_argument, NULL, 'D'},
  {"ignore-case", no_argument, NULL, 'i'},
  {"unique", no_argument, NULL, 'u'},
  {"skip-fields", required_argument, NULL, 'f'},
  {"skip-chars", required_argument, NULL, 's'},
  {"check-chars", required_argument, NULL, 'w'},
  {"zero-terminated", no_argument, NULL, 'z'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {NULL, 0, NULL, 0}
};
 126
 127 void
 128 usage (int status)
 129 {
 130   if (status != EXIT_SUCCESS)
 131     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 132              program_name);
 133   else
 134     {
 135       printf (_("\
 136 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 137 "),
 138               program_name);
 139       fputs (_("\
 140 Filter adjacent matching lines from INPUT (or standard input),\n\
 141 writing to OUTPUT (or standard output).\n\
 142 \n\
 143 With no options, matching lines are merged to the first occurrence.\n\
 144 \n\
 145 "), stdout);
 146      fputs (_("\
 147 Mandatory arguments to long options are mandatory for short options too.\n\
 148 "), stdout);
 149      fputs (_("\
 150   -c, --count           prefix lines by the number of occurrences\n\
 151   -d, --repeated        only print duplicate lines\n\
 152 "), stdout);
 153      fputs (_("\
 154   -D, --all-repeated[=delimit-method]  print all duplicate lines\n\
 155                         delimit-method={none(default),prepend,separate}\n\
 156                         Delimiting is done with blank lines\n\
 157   -f, --skip-fields=N   avoid comparing the first N fields\n\
 158   -i, --ignore-case     ignore differences in case when comparing\n\
 159   -s, --skip-chars=N    avoid comparing the first N characters\n\
 160   -u, --unique          only print unique lines\n\
 161   -z, --zero-terminated  end lines with 0 byte, not newline\n\
 162 "), stdout);
 163      fputs (_("\
 164   -w, --check-chars=N   compare no more than N characters in lines\n\
 165 "), stdout);
 166      fputs (HELP_OPTION_DESCRIPTION, stdout);
 167      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 168      fputs (_("\
 169 \n\
 170 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
 171 characters.  Fields are skipped before chars.\n\
 172 "), stdout);
 173      fputs (_("\
 174 \n\
 175 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
 176 You may want to sort the input first, or use `sort -u' without `uniq'.\n\
 177 Also, comparisons honor the rules specified by `LC_COLLATE'.\n\
 178 "), stdout);
 179       emit_ancillary_info ();
 180     }
 181   exit (status);
 182 }
 183
 184 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
 185    invalid.  Silently convert too-large values to SIZE_MAX.  */
 186
 187 static size_t
 188 size_opt (char const *opt, char const *msgid)
 189 {
 190   unsigned long int size;
 191   verify (SIZE_MAX <= ULONG_MAX);
 192
 193   switch (xstrtoul (opt, NULL, 10, &size, ""))
 194     {
 195     case LONGINT_OK:
 196     case LONGINT_OVERFLOW:
 197       break;
 198
 199     default:
 200       error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 201     }
 202
 203   return MIN (size, SIZE_MAX);
 204 }
 205
 206 /* Given a linebuffer LINE,
 207    return a pointer to the beginning of the line's field to be compared. */
 208
 209 static char *
 210 find_field (struct linebuffer const *line)
 211 {
 212   size_t count;
 213   char const *lp = line->buffer;
 214   size_t size = line->length - 1;
 215   size_t i = 0;
 216
 217   for (count = 0; count < skip_fields; count++)
 218     {
 219       while (i < size && isblank (to_uchar (lp[i])))
 220         i++;
 221       while (i < size && !isblank (to_uchar (lp[i])))
 222         i++;
 223     }
 224
 225   for (count = 0; count < skip_chars && i < size; count++)
 226     i++;
 227
 228   return line->buffer + i;
 229 }
 230
 231 /* Return false if two strings OLD and NEW match, true if not.
 232    OLD and NEW point not to the beginnings of the lines
 233    but rather to the beginnings of the fields to compare.
 234    OLDLEN and NEWLEN are their lengths. */
 235
 236 static bool
 237 different (char *old, char *new, size_t oldlen, size_t newlen)
 238 {
 239   if (check_chars < oldlen)
 240     oldlen = check_chars;
 241   if (check_chars < newlen)
 242     newlen = check_chars;
 243
 244   if (ignore_case)
 245     {
 246       /* FIXME: This should invoke strcoll somehow.  */
 247       return oldlen != newlen || memcasecmp (old, new, oldlen);
 248     }
 249   else if (hard_LC_COLLATE)
 250     return xmemcoll (old, oldlen, new, newlen) != 0;
 251   else
 252     return oldlen != newlen || memcmp (old, new, oldlen);
 253 }
 254
 255 /* Output the line in linebuffer LINE to standard output
 256    provided that the switches say it should be output.
 257    MATCH is true if the line matches the previous line.
 258    If requested, print the number of times it occurred, as well;
 259    LINECOUNT + 1 is the number of times that the line occurred. */
 260
 261 static void
 262 writeline (struct linebuffer const *line,
 263            bool match, uintmax_t linecount)
 264 {
 265   if (! (linecount == 0 ? output_unique
 266          : !match ? output_first_repeated
 267          : output_later_repeated))
 268     return;
 269
 270   if (countmode == count_occurrences)
 271     printf ("%7" PRIuMAX " ", linecount + 1);
 272
 273   fwrite (line->buffer, sizeof (char), line->length, stdout);
 274 }
 275
 276 /* Process input file INFILE with output to OUTFILE.
 277    If either is "-", use the standard I/O stream for it instead. */
 278
 279 static void
 280 check_file (const char *infile, const char *outfile, char delimiter)
 281 {
 282   struct linebuffer lb1, lb2;
 283   struct linebuffer *thisline, *prevline;
 284
 285   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 286     error (EXIT_FAILURE, errno, "%s", infile);
 287   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 288     error (EXIT_FAILURE, errno, "%s", outfile);
 289
 290   fadvise (stdin, FADVISE_SEQUENTIAL);
 291
 292   thisline = &lb1;
 293   prevline = &lb2;
 294
 295   initbuffer (thisline);
 296   initbuffer (prevline);
 297
 298   /* The duplication in the following `if' and `else' blocks is an
 299      optimization to distinguish the common case (in which none of
 300      the following options has been specified: --count, -repeated,
 301      --all-repeated, --unique) from the others.  In the common case,
 302      this optimization lets uniq output each different line right away,
 303      without waiting to see if the next one is different.  */
 304
 305   if (output_unique && output_first_repeated && countmode == count_none)
 306     {
 307       char *prevfield IF_LINT ( = NULL);
 308       size_t prevlen IF_LINT ( = 0);
 309
 310       while (!feof (stdin))
 311         {
 312           char *thisfield;
 313           size_t thislen;
 314           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 315             break;
 316           thisfield = find_field (thisline);
 317           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 318           if (prevline->length == 0
 319               || different (thisfield, prevfield, thislen, prevlen))
 320             {
 321               fwrite (thisline->buffer, sizeof (char),
 322                       thisline->length, stdout);
 323
 324               SWAP_LINES (prevline, thisline);
 325               prevfield = thisfield;
 326               prevlen = thislen;
 327             }
 328         }
 329     }
 330   else
 331     {
 332       char *prevfield;
 333       size_t prevlen;
 334       uintmax_t match_count = 0;
 335       bool first_delimiter = true;
 336
 337       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
 338         goto closefiles;
 339       prevfield = find_field (prevline);
 340       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
 341
 342       while (!feof (stdin))
 343         {
 344           bool match;
 345           char *thisfield;
 346           size_t thislen;
 347           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 348             {
 349               if (ferror (stdin))
 350                 goto closefiles;
 351               break;
 352             }
 353           thisfield = find_field (thisline);
 354           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 355           match = !different (thisfield, prevfield, thislen, prevlen);
 356           match_count += match;
 357
 358           if (match_count == UINTMAX_MAX)
 359             {
 360               if (count_occurrences)
 361                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
 362               match_count--;
 363             }
 364
 365           if (delimit_groups != DM_NONE)
 366             {
 367               if (!match)
 368                 {
 369                   if (match_count) /* a previous match */
 370                     first_delimiter = false; /* Only used when DM_SEPARATE */
 371                 }
 372               else if (match_count == 1)
 373                 {
 374                   if ((delimit_groups == DM_PREPEND)
 375                       || (delimit_groups == DM_SEPARATE
 376                           && !first_delimiter))
 377                     putchar (delimiter);
 378                 }
 379             }
 380
 381           if (!match || output_later_repeated)
 382             {
 383               writeline (prevline, match, match_count);
 384               SWAP_LINES (prevline, thisline);
 385               prevfield = thisfield;
 386               prevlen = thislen;
 387               if (!match)
 388                 match_count = 0;
 389             }
 390         }
 391
 392       writeline (prevline, false, match_count);
 393     }
 394
 395  closefiles:
 396   if (ferror (stdin) || fclose (stdin) != 0)
 397     error (EXIT_FAILURE, 0, _("error reading %s"), infile);
 398
 399   /* stdout is handled via the atexit-invoked close_stdout function.  */
 400
 401   free (lb1.buffer);
 402   free (lb2.buffer);
 403 }
 404
/* Records which syntax most recently set skip_fields while main parses
   the command line.  The obsolete form supplies the number one digit
   at a time, so main needs to know whether the next digit continues an
   obsolete-style number or must start a fresh one after a -f option.  */
enum Skip_field_option_type
  {
    SFO_NONE,           /* No field-skipping option seen yet.  */
    SFO_OBSOLETE,       /* Last set by the obsolete -N digit form.  */
    SFO_NEW             /* Last set by -f N / --skip-fields=N.  */
  };
 411
 412 int
 413 main (int argc, char **argv)
 414 {
 415   int optc = 0;
 416   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
 417   enum Skip_field_option_type skip_field_option_type = SFO_NONE;
 418   int nfiles = 0;
 419   char const *file[2];
 420   char delimiter = '\n';        /* change with --zero-terminated, -z */
 421
 422   file[0] = file[1] = "-";
 423   initialize_main (&argc, &argv);
 424   set_program_name (argv[0]);
 425   setlocale (LC_ALL, "");
 426   bindtextdomain (PACKAGE, LOCALEDIR);
 427   textdomain (PACKAGE);
 428   hard_LC_COLLATE = hard_locale (LC_COLLATE);
 429
 430   atexit (close_stdout);
 431
 432   skip_chars = 0;
 433   skip_fields = 0;
 434   check_chars = SIZE_MAX;
 435   output_unique = output_first_repeated = true;
 436   output_later_repeated = false;
 437   countmode = count_none;
 438   delimit_groups = DM_NONE;
 439
 440   while (true)
 441     {
 442       /* Parse an operand with leading "+" as a file after "--" was
 443          seen; or if pedantic and a file was seen; or if not
 444          obsolete.  */
 445
 446       if (optc == -1
 447           || (posixly_correct && nfiles != 0)
 448           || ((optc = getopt_long (argc, argv,
 449                                    "-0123456789Dcdf:is:uw:z", longopts, NULL))
 450               == -1))
 451         {
 452           if (argc <= optind)
 453             break;
 454           if (nfiles == 2)
 455             {
 456               error (0, 0, _("extra operand %s"), quote (argv[optind]));
 457               usage (EXIT_FAILURE);
 458             }
 459           file[nfiles++] = argv[optind++];
 460         }
 461       else switch (optc)
 462         {
 463         case 1:
 464           {
 465             unsigned long int size;
 466             if (optarg[0] == '+'
 467                 && posix2_version () < 200112
 468                 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
 469                 && size <= SIZE_MAX)
 470               skip_chars = size;
 471             else if (nfiles == 2)
 472               {
 473                 error (0, 0, _("extra operand %s"), quote (optarg));
 474                 usage (EXIT_FAILURE);
 475               }
 476             else
 477               file[nfiles++] = optarg;
 478           }
 479           break;
 480
 481         case '0':
 482         case '1':
 483         case '2':
 484         case '3':
 485         case '4':
 486         case '5':
 487         case '6':
 488         case '7':
 489         case '8':
 490         case '9':
 491           {
 492             if (skip_field_option_type == SFO_NEW)
 493               skip_fields = 0;
 494
 495             if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
 496               skip_fields = SIZE_MAX;
 497
 498             skip_field_option_type = SFO_OBSOLETE;
 499           }
 500           break;
 501
 502         case 'c':
 503           countmode = count_occurrences;
 504           break;
 505
 506         case 'd':
 507           output_unique = false;
 508           break;
 509
 510         case 'D':
 511           output_unique = false;
 512           output_later_repeated = true;
 513           if (optarg == NULL)
 514             delimit_groups = DM_NONE;
 515           else
 516             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 517                                         delimit_method_string,
 518                                         delimit_method_map);
 519           break;
 520
 521         case 'f':
 522           skip_field_option_type = SFO_NEW;
 523           skip_fields = size_opt (optarg,
 524                                   N_("invalid number of fields to skip"));
 525           break;
 526
 527         case 'i':
 528           ignore_case = true;
 529           break;
 530
 531         case 's':
 532           skip_chars = size_opt (optarg,
 533                                  N_("invalid number of bytes to skip"));
 534           break;
 535
 536         case 'u':
 537           output_first_repeated = false;
 538           break;
 539
 540         case 'w':
 541           check_chars = size_opt (optarg,
 542                                   N_("invalid number of bytes to compare"));
 543           break;
 544
 545         case 'z':
 546           delimiter = '\0';
 547           break;
 548
 549         case_GETOPT_HELP_CHAR;
 550
 551         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 552
 553         default:
 554           usage (EXIT_FAILURE);
 555         }
 556     }
 557
 558   if (countmode == count_occurrences && output_later_repeated)
 559     {
 560       error (0, 0,
 561            _("printing all duplicated lines and repeat counts is meaningless"));
 562       usage (EXIT_FAILURE);
 563     }
 564
 565   check_file (file[0], file[1], delimiter);
 566
 567   exit (EXIT_SUCCESS);
 568 }