src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 1986-2015 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Richard M. Stallman and David MacKenzie. */
  18 \f
  19 #include <config.h>
  20
  21 #include <getopt.h>
  22 #include <sys/types.h>
  23
  24 #include "system.h"
  25 #include "argmatch.h"
  26 #include "linebuffer.h"
  27 #include "error.h"
  28 #include "fadvise.h"
  29 #include "hard-locale.h"
  30 #include "posixver.h"
  31 #include "quote.h"
  32 #include "stdio--.h"
  33 #include "xmemcoll.h"
  34 #include "xstrtol.h"
  35 #include "memcasecmp.h"
  36
  37 /* The official name of this program (e.g., no 'g' prefix).  */
  38 #define PROGRAM_NAME "uniq"
  39
  40 #define AUTHORS \
  41   proper_name ("Richard M. Stallman"), \
  42   proper_name ("David MacKenzie")
  43
  44 #define SWAP_LINES(A, B)                        \
  45   do                                            \
  46     {                                           \
  47       struct linebuffer *_tmp;                  \
  48       _tmp = (A);                               \
  49       (A) = (B);                                \
  50       (B) = _tmp;                               \
  51     }                                           \
  52   while (0)
  53
  54 /* True if the LC_COLLATE locale is hard.  */
  55 static bool hard_LC_COLLATE;
  56
  57 /* Number of fields to skip on each line when doing comparisons. */
  58 static size_t skip_fields;
  59
  60 /* Number of chars to skip after skipping any fields. */
  61 static size_t skip_chars;
  62
  63 /* Number of chars to compare. */
  64 static size_t check_chars;
  65
  66 enum countmode
  67 {
  68   count_occurrences,            /* -c Print count before output lines. */
  69   count_none                    /* Default.  Do not print counts. */
  70 };
  71
  72 /* Whether and how to precede the output lines with a count of the number of
  73    times they occurred in the input. */
  74 static enum countmode countmode;
  75
  76 /* Which lines to output: unique lines, the first of a group of
  77    repeated lines, and the second and subsequented of a group of
  78    repeated lines.  */
  79 static bool output_unique;
  80 static bool output_first_repeated;
  81 static bool output_later_repeated;
  82
  83 /* If true, ignore case when comparing.  */
  84 static bool ignore_case;
  85
  86 enum delimit_method
  87 {
  88   /* No delimiters output.  --all-repeated[=none] */
  89   DM_NONE,
  90
  91   /* Delimiter precedes all groups.  --all-repeated=prepend */
  92   DM_PREPEND,
  93
  94   /* Delimit all groups.  --all-repeated=separate */
  95   DM_SEPARATE
  96 };
  97
  98 static char const *const delimit_method_string[] =
  99 {
 100   "none", "prepend", "separate", NULL
 101 };
 102
 103 static enum delimit_method const delimit_method_map[] =
 104 {
 105   DM_NONE, DM_PREPEND, DM_SEPARATE
 106 };
 107
 108 /* Select whether/how to delimit groups of duplicate lines.  */
 109 static enum delimit_method delimit_groups;
 110
 111 enum grouping_method
 112 {
 113   /* No grouping, when "--group" isn't used */
 114   GM_NONE,
 115
 116   /* Delimiter preceges all groups.  --group=prepend */
 117   GM_PREPEND,
 118
 119   /* Delimiter follows all groups.   --group=append */
 120   GM_APPEND,
 121
 122   /* Delimiter between groups.    --group[=separate] */
 123   GM_SEPARATE,
 124
 125   /* Delimiter before and after each group. --group=both */
 126   GM_BOTH
 127 };
 128
 129 static char const *const grouping_method_string[] =
 130 {
 131   "prepend", "append", "separate", "both", NULL
 132 };
 133
 134 static enum grouping_method const grouping_method_map[] =
 135 {
 136   GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
 137 };
 138
 139 static enum grouping_method grouping = GM_NONE;
 140
 141 enum
 142 {
 143   GROUP_OPTION = CHAR_MAX + 1
 144 };
 145
 146 static struct option const longopts[] =
 147 {
 148   {"count", no_argument, NULL, 'c'},
 149   {"repeated", no_argument, NULL, 'd'},
 150   {"all-repeated", optional_argument, NULL, 'D'},
 151   {"group", optional_argument, NULL, GROUP_OPTION},
 152   {"ignore-case", no_argument, NULL, 'i'},
 153   {"unique", no_argument, NULL, 'u'},
 154   {"skip-fields", required_argument, NULL, 'f'},
 155   {"skip-chars", required_argument, NULL, 's'},
 156   {"check-chars", required_argument, NULL, 'w'},
 157   {"zero-terminated", no_argument, NULL, 'z'},
 158   {GETOPT_HELP_OPTION_DECL},
 159   {GETOPT_VERSION_OPTION_DECL},
 160   {NULL, 0, NULL, 0}
 161 };
 162
 163 void
 164 usage (int status)
 165 {
 166   if (status != EXIT_SUCCESS)
 167     emit_try_help ();
 168   else
 169     {
 170       printf (_("\
 171 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 172 "),
 173               program_name);
 174       fputs (_("\
 175 Filter adjacent matching lines from INPUT (or standard input),\n\
 176 writing to OUTPUT (or standard output).\n\
 177 \n\
 178 With no options, matching lines are merged to the first occurrence.\n\
 179 "), stdout);
 180
 181       emit_mandatory_arg_note ();
 182
 183      fputs (_("\
 184   -c, --count           prefix lines by the number of occurrences\n\
 185   -d, --repeated        only print duplicate lines, one for each group\n\
 186 "), stdout);
 187      fputs (_("\
 188   -D                    print all duplicate lines\n\
 189       --all-repeated[=METHOD]  like -D, but allow separating groups\n\
 190                                  with an empty line;\n\
 191                                  METHOD={none(default),prepend,separate}\n\
 192 "), stdout);
 193      fputs (_("\
 194   -f, --skip-fields=N   avoid comparing the first N fields\n\
 195 "), stdout);
 196      fputs (_("\
 197       --group[=METHOD]  show all items, separating groups with an empty line;\n\
 198                           METHOD={separate(default),prepend,append,both}\n\
 199 "), stdout);
 200      fputs (_("\
 201   -i, --ignore-case     ignore differences in case when comparing\n\
 202   -s, --skip-chars=N    avoid comparing the first N characters\n\
 203   -u, --unique          only print unique lines\n\
 204 "), stdout);
 205       fputs (_("\
 206   -z, --zero-terminated     line delimiter is NUL, not newline\n\
 207 "), stdout);
 208      fputs (_("\
 209   -w, --check-chars=N   compare no more than N characters in lines\n\
 210 "), stdout);
 211      fputs (HELP_OPTION_DESCRIPTION, stdout);
 212      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 213      fputs (_("\
 214 \n\
 215 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
 216 characters.  Fields are skipped before chars.\n\
 217 "), stdout);
 218      fputs (_("\
 219 \n\
 220 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
 221 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
 222 Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\
 223 "), stdout);
 224       emit_ancillary_info (PROGRAM_NAME);
 225     }
 226   exit (status);
 227 }
 228
 229 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
 230    invalid.  Silently convert too-large values to SIZE_MAX.  */
 231
 232 static size_t
 233 size_opt (char const *opt, char const *msgid)
 234 {
 235   unsigned long int size;
 236   verify (SIZE_MAX <= ULONG_MAX);
 237
 238   switch (xstrtoul (opt, NULL, 10, &size, ""))
 239     {
 240     case LONGINT_OK:
 241     case LONGINT_OVERFLOW:
 242       break;
 243
 244     default:
 245       error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 246     }
 247
 248   return MIN (size, SIZE_MAX);
 249 }
 250
 251 /* Given a linebuffer LINE,
 252    return a pointer to the beginning of the line's field to be compared. */
 253
 254 static char * _GL_ATTRIBUTE_PURE
 255 find_field (struct linebuffer const *line)
 256 {
 257   size_t count;
 258   char const *lp = line->buffer;
 259   size_t size = line->length - 1;
 260   size_t i = 0;
 261
 262   for (count = 0; count < skip_fields && i < size; count++)
 263     {
 264       while (i < size && isblank (to_uchar (lp[i])))
 265         i++;
 266       while (i < size && !isblank (to_uchar (lp[i])))
 267         i++;
 268     }
 269
 270   i += MIN (skip_chars, size - i);
 271
 272   return line->buffer + i;
 273 }
 274
 275 /* Return false if two strings OLD and NEW match, true if not.
 276    OLD and NEW point not to the beginnings of the lines
 277    but rather to the beginnings of the fields to compare.
 278    OLDLEN and NEWLEN are their lengths. */
 279
 280 static bool
 281 different (char *old, char *new, size_t oldlen, size_t newlen)
 282 {
 283   if (check_chars < oldlen)
 284     oldlen = check_chars;
 285   if (check_chars < newlen)
 286     newlen = check_chars;
 287
 288   if (ignore_case)
 289     {
 290       /* FIXME: This should invoke strcoll somehow.  */
 291       return oldlen != newlen || memcasecmp (old, new, oldlen);
 292     }
 293   else if (hard_LC_COLLATE)
 294     return xmemcoll (old, oldlen, new, newlen) != 0;
 295   else
 296     return oldlen != newlen || memcmp (old, new, oldlen);
 297 }
 298
 299 /* Output the line in linebuffer LINE to standard output
 300    provided that the switches say it should be output.
 301    MATCH is true if the line matches the previous line.
 302    If requested, print the number of times it occurred, as well;
 303    LINECOUNT + 1 is the number of times that the line occurred. */
 304
 305 static void
 306 writeline (struct linebuffer const *line,
 307            bool match, uintmax_t linecount)
 308 {
 309   if (! (linecount == 0 ? output_unique
 310          : !match ? output_first_repeated
 311          : output_later_repeated))
 312     return;
 313
 314   if (countmode == count_occurrences)
 315     printf ("%7" PRIuMAX " ", linecount + 1);
 316
 317   fwrite (line->buffer, sizeof (char), line->length, stdout);
 318 }
 319
 320 /* Process input file INFILE with output to OUTFILE.
 321    If either is "-", use the standard I/O stream for it instead. */
 322
 323 static void
 324 check_file (const char *infile, const char *outfile, char delimiter)
 325 {
 326   struct linebuffer lb1, lb2;
 327   struct linebuffer *thisline, *prevline;
 328
 329   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 330     error (EXIT_FAILURE, errno, "%s", quote (infile));
 331   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 332     error (EXIT_FAILURE, errno, "%s", quote (outfile));
 333
 334   fadvise (stdin, FADVISE_SEQUENTIAL);
 335
 336   thisline = &lb1;
 337   prevline = &lb2;
 338
 339   initbuffer (thisline);
 340   initbuffer (prevline);
 341
 342   /* The duplication in the following 'if' and 'else' blocks is an
 343      optimization to distinguish between when we can print input
 344      lines immediately (1. & 2.) or not.
 345
 346      1. --group => all input lines are printed.
 347         checking for unique/duplicated lines is used only for printing
 348         group separators.
 349
 350      2. The default case in which none of these options has been specified:
 351           --count, --repeated,  --all-repeated, --unique
 352         In the default case, this optimization lets uniq output each different
 353         line right away, without waiting to see if the next one is different.
 354
 355      3. All other cases.
 356   */
 357   if (output_unique && output_first_repeated && countmode == count_none)
 358     {
 359       char *prevfield IF_LINT ( = NULL);
 360       size_t prevlen IF_LINT ( = 0);
 361       bool first_group_printed = false;
 362
 363       while (!feof (stdin))
 364         {
 365           char *thisfield;
 366           size_t thislen;
 367           bool new_group;
 368
 369           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 370             break;
 371
 372           thisfield = find_field (thisline);
 373           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 374
 375           new_group = (prevline->length == 0
 376                        || different (thisfield, prevfield, thislen, prevlen));
 377
 378           if (new_group && grouping != GM_NONE
 379               && (grouping == GM_PREPEND || grouping == GM_BOTH
 380                   || (first_group_printed && (grouping == GM_APPEND
 381                                               || grouping == GM_SEPARATE))))
 382             putchar (delimiter);
 383
 384           if (new_group || grouping != GM_NONE)
 385             {
 386               fwrite (thisline->buffer, sizeof (char),
 387                       thisline->length, stdout);
 388
 389               SWAP_LINES (prevline, thisline);
 390               prevfield = thisfield;
 391               prevlen = thislen;
 392               first_group_printed = true;
 393             }
 394         }
 395       if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
 396         putchar (delimiter);
 397     }
 398   else
 399     {
 400       char *prevfield;
 401       size_t prevlen;
 402       uintmax_t match_count = 0;
 403       bool first_delimiter = true;
 404
 405       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
 406         goto closefiles;
 407       prevfield = find_field (prevline);
 408       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
 409
 410       while (!feof (stdin))
 411         {
 412           bool match;
 413           char *thisfield;
 414           size_t thislen;
 415           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 416             {
 417               if (ferror (stdin))
 418                 goto closefiles;
 419               break;
 420             }
 421           thisfield = find_field (thisline);
 422           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 423           match = !different (thisfield, prevfield, thislen, prevlen);
 424           match_count += match;
 425
 426           if (match_count == UINTMAX_MAX)
 427             {
 428               if (count_occurrences)
 429                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
 430               match_count--;
 431             }
 432
 433           if (delimit_groups != DM_NONE)
 434             {
 435               if (!match)
 436                 {
 437                   if (match_count) /* a previous match */
 438                     first_delimiter = false; /* Only used when DM_SEPARATE */
 439                 }
 440               else if (match_count == 1)
 441                 {
 442                   if ((delimit_groups == DM_PREPEND)
 443                       || (delimit_groups == DM_SEPARATE
 444                           && !first_delimiter))
 445                     putchar (delimiter);
 446                 }
 447             }
 448
 449           if (!match || output_later_repeated)
 450             {
 451               writeline (prevline, match, match_count);
 452               SWAP_LINES (prevline, thisline);
 453               prevfield = thisfield;
 454               prevlen = thislen;
 455               if (!match)
 456                 match_count = 0;
 457             }
 458         }
 459
 460       writeline (prevline, false, match_count);
 461     }
 462
 463  closefiles:
 464   if (ferror (stdin) || fclose (stdin) != 0)
 465     error (EXIT_FAILURE, 0, _("error reading %s"), quote (infile));
 466
 467   /* stdout is handled via the atexit-invoked close_stdout function.  */
 468
 469   free (lb1.buffer);
 470   free (lb2.buffer);
 471 }
 472
 473 enum Skip_field_option_type
 474   {
 475     SFO_NONE,
 476     SFO_OBSOLETE,
 477     SFO_NEW
 478   };
 479
 480 int
 481 main (int argc, char **argv)
 482 {
 483   int optc = 0;
 484   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
 485   enum Skip_field_option_type skip_field_option_type = SFO_NONE;
 486   unsigned int nfiles = 0;
 487   char const *file[2];
 488   char delimiter = '\n';        /* change with --zero-terminated, -z */
 489   bool output_option_used = false;   /* if true, one of -u/-d/-D/-c was used */
 490
 491   file[0] = file[1] = "-";
 492   initialize_main (&argc, &argv);
 493   set_program_name (argv[0]);
 494   setlocale (LC_ALL, "");
 495   bindtextdomain (PACKAGE, LOCALEDIR);
 496   textdomain (PACKAGE);
 497   hard_LC_COLLATE = hard_locale (LC_COLLATE);
 498
 499   atexit (close_stdout);
 500
 501   skip_chars = 0;
 502   skip_fields = 0;
 503   check_chars = SIZE_MAX;
 504   output_unique = output_first_repeated = true;
 505   output_later_repeated = false;
 506   countmode = count_none;
 507   delimit_groups = DM_NONE;
 508
 509   while (true)
 510     {
 511       /* Parse an operand with leading "+" as a file after "--" was
 512          seen; or if pedantic and a file was seen; or if not
 513          obsolete.  */
 514
 515       if (optc == -1
 516           || (posixly_correct && nfiles != 0)
 517           || ((optc = getopt_long (argc, argv,
 518                                    "-0123456789Dcdf:is:uw:z", longopts, NULL))
 519               == -1))
 520         {
 521           if (argc <= optind)
 522             break;
 523           if (nfiles == 2)
 524             {
 525               error (0, 0, _("extra operand %s"), quote (argv[optind]));
 526               usage (EXIT_FAILURE);
 527             }
 528           file[nfiles++] = argv[optind++];
 529         }
 530       else switch (optc)
 531         {
 532         case 1:
 533           {
 534             unsigned long int size;
 535             if (optarg[0] == '+'
 536                 && posix2_version () < 200112
 537                 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
 538                 && size <= SIZE_MAX)
 539               skip_chars = size;
 540             else if (nfiles == 2)
 541               {
 542                 error (0, 0, _("extra operand %s"), quote (optarg));
 543                 usage (EXIT_FAILURE);
 544               }
 545             else
 546               file[nfiles++] = optarg;
 547           }
 548           break;
 549
 550         case '0':
 551         case '1':
 552         case '2':
 553         case '3':
 554         case '4':
 555         case '5':
 556         case '6':
 557         case '7':
 558         case '8':
 559         case '9':
 560           {
 561             if (skip_field_option_type == SFO_NEW)
 562               skip_fields = 0;
 563
 564             if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
 565               skip_fields = SIZE_MAX;
 566
 567             skip_field_option_type = SFO_OBSOLETE;
 568           }
 569           break;
 570
 571         case 'c':
 572           countmode = count_occurrences;
 573           output_option_used = true;
 574           break;
 575
 576         case 'd':
 577           output_unique = false;
 578           output_option_used = true;
 579           break;
 580
 581         case 'D':
 582           output_unique = false;
 583           output_later_repeated = true;
 584           if (optarg == NULL)
 585             delimit_groups = DM_NONE;
 586           else
 587             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 588                                         delimit_method_string,
 589                                         delimit_method_map);
 590           output_option_used = true;
 591           break;
 592
 593         case GROUP_OPTION:
 594           if (optarg == NULL)
 595             grouping = GM_SEPARATE;
 596           else
 597             grouping = XARGMATCH ("--group", optarg,
 598                                   grouping_method_string,
 599                                   grouping_method_map);
 600           break;
 601
 602         case 'f':
 603           skip_field_option_type = SFO_NEW;
 604           skip_fields = size_opt (optarg,
 605                                   N_("invalid number of fields to skip"));
 606           break;
 607
 608         case 'i':
 609           ignore_case = true;
 610           break;
 611
 612         case 's':
 613           skip_chars = size_opt (optarg,
 614                                  N_("invalid number of bytes to skip"));
 615           break;
 616
 617         case 'u':
 618           output_first_repeated = false;
 619           output_option_used = true;
 620           break;
 621
 622         case 'w':
 623           check_chars = size_opt (optarg,
 624                                   N_("invalid number of bytes to compare"));
 625           break;
 626
 627         case 'z':
 628           delimiter = '\0';
 629           break;
 630
 631         case_GETOPT_HELP_CHAR;
 632
 633         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 634
 635         default:
 636           usage (EXIT_FAILURE);
 637         }
 638     }
 639
 640   /* Note we could allow --group with -D at least, and that would
 641      avoid the need to specify a grouping method to --all-repeated.
 642      It was thought best to avoid deprecating those parameters though
 643      and keep --group separate to other options.  */
 644   if (grouping != GM_NONE && output_option_used)
 645     {
 646       error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
 647       usage (EXIT_FAILURE);
 648     }
 649
 650   if (grouping != GM_NONE && countmode != count_none)
 651     {
 652       error (0, 0,
 653            _("grouping and printing repeat counts is meaningless"));
 654       usage (EXIT_FAILURE);
 655     }
 656
 657   if (countmode == count_occurrences && output_later_repeated)
 658     {
 659       error (0, 0,
 660            _("printing all duplicated lines and repeat counts is meaningless"));
 661       usage (EXIT_FAILURE);
 662     }
 663
 664   check_file (file[0], file[1], delimiter);
 665
 666   return EXIT_SUCCESS;
 667 }