build: update gnulib submodule to latest
[coreutils.git] / src / uniq.c
blobc85be6ee811b84bb384e1d24fb1412d3458eee29
1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 1986-2015 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
19 #include <config.h>
21 #include <getopt.h>
22 #include <sys/types.h>
24 #include "system.h"
25 #include "argmatch.h"
26 #include "linebuffer.h"
27 #include "error.h"
28 #include "fadvise.h"
29 #include "hard-locale.h"
30 #include "posixver.h"
31 #include "quote.h"
32 #include "stdio--.h"
33 #include "xmemcoll.h"
34 #include "xstrtol.h"
35 #include "memcasecmp.h"
37 /* The official name of this program (e.g., no 'g' prefix). */
38 #define PROGRAM_NAME "uniq"
40 #define AUTHORS \
41 proper_name ("Richard M. Stallman"), \
42 proper_name ("David MacKenzie")
/* Exchange the two 'struct linebuffer *' lvalues A and B.
   Each argument is evaluated twice, so neither may have side effects.  */
#define SWAP_LINES(A, B)                        \
  do                                            \
    {                                           \
      struct linebuffer *_swap_tmp = (A);       \
      (A) = (B);                                \
      (B) = _swap_tmp;                          \
    }                                           \
  while (0)
54 /* True if the LC_COLLATE locale is hard.  main sets this from
   hard_locale (LC_COLLATE); different () consults it to choose
   xmemcoll over memcmp when case is not being ignored. */
55 static bool hard_LC_COLLATE;
57 /* Number of fields to skip on each line when doing comparisons.
   Set by -f/--skip-fields or by the obsolete digit options. */
58 static size_t skip_fields;
60 /* Number of chars to skip after skipping any fields.
   Set by -s/--skip-chars or by an obsolete "+N" operand. */
61 static size_t skip_chars;
63 /* Number of chars to compare.  main initializes this to SIZE_MAX, so
   by default no limit is applied; -w/--check-chars lowers it. */
64 static size_t check_chars;
/* Whether output lines are prefixed with an occurrence count.
   NOTE: count_occurrences is the first enumerator and therefore has the
   value 0.  Always test 'countmode == count_occurrences'; the bare
   enumerator used as a condition is always false. */
66 enum countmode
68 count_occurrences, /* -c Print count before output lines. */
69 count_none /* Default. Do not print counts. */
72 /* Whether and how to precede the output lines with a count of the number of
73 times they occurred in the input.  main resets this to count_none
   before option parsing; -c selects count_occurrences. */
74 static enum countmode countmode;
76 /* Which lines to output: unique lines, the first of a group of
77 repeated lines, and the second and subsequent lines of a group of
78 repeated lines.
   main defaults the first two to true and the third to false.
   -d clears output_unique, -u clears output_first_repeated, and
   -D clears output_unique and sets output_later_repeated. */
79 static bool output_unique;
80 static bool output_first_repeated;
81 static bool output_later_repeated;
83 /* If true, ignore case when comparing.  Set by -i/--ignore-case. */
84 static bool ignore_case;
/* How --all-repeated (-D) delimits groups of duplicate lines. */
86 enum delimit_method
88 /* No delimiters output. --all-repeated[=none] */
89 DM_NONE,
91 /* Delimiter precedes all groups. --all-repeated=prepend */
92 DM_PREPEND,
94 /* Delimiter between groups, i.e. before every group except the
   first one output. --all-repeated=separate */
95 DM_SEPARATE
/* Argument spellings accepted by --all-repeated; NULL-terminated and
   index-aligned with delimit_method_map below (both are passed to
   XARGMATCH in main). */
98 static char const *const delimit_method_string[] =
100 "none", "prepend", "separate", NULL
/* Value selected for the spelling at the same index in
   delimit_method_string. */
103 static enum delimit_method const delimit_method_map[] =
105 DM_NONE, DM_PREPEND, DM_SEPARATE
108 /* Select whether/how to delimit groups of duplicate lines. */
109 static enum delimit_method delimit_groups;
/* How --group delimits groups of adjacent matching lines. */
111 enum grouping_method
113 /* No grouping, when "--group" isn't used */
114 GM_NONE,
116 /* Delimiter precedes all groups. --group=prepend */
117 GM_PREPEND,
119 /* Delimiter follows all groups. --group=append */
120 GM_APPEND,
122 /* Delimiter between groups. --group[=separate] */
123 GM_SEPARATE,
125 /* Delimiter before and after each group. --group=both */
126 GM_BOTH
/* Argument spellings accepted by --group; NULL-terminated and
   index-aligned with grouping_method_map below.  GM_NONE has no
   spelling: it is only the "option not given" state. */
129 static char const *const grouping_method_string[] =
131 "prepend", "append", "separate", "both", NULL
/* Value selected for the spelling at the same index in
   grouping_method_string. */
134 static enum grouping_method const grouping_method_map[] =
136 GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
/* Grouping method in effect; --group without an argument selects
   GM_SEPARATE (see main). */
139 static enum grouping_method grouping = GM_NONE;
/* Value getopt_long returns for --group, which has no short option
   letter; CHAR_MAX + 1 keeps it distinct from every character code. */
141 enum
143 GROUP_OPTION = CHAR_MAX + 1
/* Long options understood by main.  --all-repeated and --group take an
   optional METHOD argument; --skip-fields, --skip-chars and
   --check-chars require a number.  The table ends with an all-zero
   entry. */
146 static struct option const longopts[] =
148 {"count", no_argument, NULL, 'c'},
149 {"repeated", no_argument, NULL, 'd'},
150 {"all-repeated", optional_argument, NULL, 'D'},
151 {"group", optional_argument, NULL, GROUP_OPTION},
152 {"ignore-case", no_argument, NULL, 'i'},
153 {"unique", no_argument, NULL, 'u'},
154 {"skip-fields", required_argument, NULL, 'f'},
155 {"skip-chars", required_argument, NULL, 's'},
156 {"check-chars", required_argument, NULL, 'w'},
157 {"zero-terminated", no_argument, NULL, 'z'},
158 {GETOPT_HELP_OPTION_DECL},
159 {GETOPT_VERSION_OPTION_DECL},
160 {NULL, 0, NULL, 0}
/* Report how to invoke the program, then exit with STATUS.
   When STATUS is not EXIT_SUCCESS only emit_try_help () is called;
   otherwise the full help text is written to stdout.  This function
   does not return: it always ends in exit (STATUS). */
163 void
164 usage (int status)
166 if (status != EXIT_SUCCESS)
167 emit_try_help ();
168 else
170 printf (_("\
171 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
173 program_name);
174 fputs (_("\
175 Filter adjacent matching lines from INPUT (or standard input),\n\
176 writing to OUTPUT (or standard output).\n\
178 With no options, matching lines are merged to the first occurrence.\n\
179 "), stdout);
181 emit_mandatory_arg_note ();
/* The option descriptions are emitted as several separate translatable
   strings rather than one large one. */
183 fputs (_("\
184 -c, --count prefix lines by the number of occurrences\n\
185 -d, --repeated only print duplicate lines, one for each group\n\
186 "), stdout);
187 fputs (_("\
188 -D print all duplicate lines\n\
189 --all-repeated[=METHOD] like -D, but allow separating groups\n\
190 with an empty line;\n\
191 METHOD={none(default),prepend,separate}\n\
192 "), stdout);
193 fputs (_("\
194 -f, --skip-fields=N avoid comparing the first N fields\n\
195 "), stdout);
196 fputs (_("\
197 --group[=METHOD] show all items, separating groups with an empty line;\n\
198 METHOD={separate(default),prepend,append,both}\n\
199 "), stdout);
200 fputs (_("\
201 -i, --ignore-case ignore differences in case when comparing\n\
202 -s, --skip-chars=N avoid comparing the first N characters\n\
203 -u, --unique only print unique lines\n\
204 "), stdout);
205 fputs (_("\
206 -z, --zero-terminated line delimiter is NUL, not newline\n\
207 "), stdout);
208 fputs (_("\
209 -w, --check-chars=N compare no more than N characters in lines\n\
210 "), stdout);
211 fputs (HELP_OPTION_DESCRIPTION, stdout);
212 fputs (VERSION_OPTION_DESCRIPTION, stdout);
213 fputs (_("\
215 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
216 characters. Fields are skipped before chars.\n\
217 "), stdout);
218 fputs (_("\
220 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
221 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
222 Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\
223 "), stdout);
224 emit_ancillary_info (PROGRAM_NAME);
/* Reached on both the error path and the help path. */
226 exit (status);
229 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
230 invalid. Silently convert too-large values to SIZE_MAX. */
232 static size_t
233 size_opt (char const *opt, char const *msgid)
235 unsigned long int size;
236 verify (SIZE_MAX <= ULONG_MAX);
238 switch (xstrtoul (opt, NULL, 10, &size, ""))
240 case LONGINT_OK:
241 case LONGINT_OVERFLOW:
242 break;
244 default:
245 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
248 return MIN (size, SIZE_MAX);
251 /* Given a linebuffer LINE,
252 return a pointer to the beginning of the line's field to be compared. */
254 static char * _GL_ATTRIBUTE_PURE
255 find_field (struct linebuffer const *line)
257 size_t count;
258 char const *lp = line->buffer;
259 size_t size = line->length - 1;
260 size_t i = 0;
262 for (count = 0; count < skip_fields && i < size; count++)
264 while (i < size && isblank (to_uchar (lp[i])))
265 i++;
266 while (i < size && !isblank (to_uchar (lp[i])))
267 i++;
270 i += MIN (skip_chars, size - i);
272 return line->buffer + i;
275 /* Return false if two strings OLD and NEW match, true if not.
276 OLD and NEW point not to the beginnings of the lines
277 but rather to the beginnings of the fields to compare.
278 OLDLEN and NEWLEN are their lengths. */
280 static bool
281 different (char *old, char *new, size_t oldlen, size_t newlen)
283 if (check_chars < oldlen)
284 oldlen = check_chars;
285 if (check_chars < newlen)
286 newlen = check_chars;
288 if (ignore_case)
290 /* FIXME: This should invoke strcoll somehow. */
291 return oldlen != newlen || memcasecmp (old, new, oldlen);
293 else if (hard_LC_COLLATE)
294 return xmemcoll (old, oldlen, new, newlen) != 0;
295 else
296 return oldlen != newlen || memcmp (old, new, oldlen);
299 /* Output the line in linebuffer LINE to standard output
300 provided that the switches say it should be output.
301 MATCH is true if the line matches the previous line.
302 If requested, print the number of times it occurred, as well;
303 LINECOUNT + 1 is the number of times that the line occurred. */
305 static void
306 writeline (struct linebuffer const *line,
307 bool match, uintmax_t linecount)
309 if (! (linecount == 0 ? output_unique
310 : !match ? output_first_repeated
311 : output_later_repeated))
312 return;
314 if (countmode == count_occurrences)
315 printf ("%7" PRIuMAX " ", linecount + 1);
317 fwrite (line->buffer, sizeof (char), line->length, stdout);
320 /* Process input file INFILE with output to OUTFILE.
321 If either is "-", use the standard I/O stream for it instead. */
323 static void
324 check_file (const char *infile, const char *outfile, char delimiter)
326 struct linebuffer lb1, lb2;
327 struct linebuffer *thisline, *prevline;
329 if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
330 error (EXIT_FAILURE, errno, "%s", quote (infile));
331 if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
332 error (EXIT_FAILURE, errno, "%s", quote (outfile));
334 fadvise (stdin, FADVISE_SEQUENTIAL);
336 thisline = &lb1;
337 prevline = &lb2;
339 initbuffer (thisline);
340 initbuffer (prevline);
342 /* The duplication in the following 'if' and 'else' blocks is an
343 optimization to distinguish between when we can print input
344 lines immediately (1. & 2.) or not.
346 1. --group => all input lines are printed.
347 checking for unique/duplicated lines is used only for printing
348 group separators.
350 2. The default case in which none of these options has been specified:
351 --count, --repeated, --all-repeated, --unique
352 In the default case, this optimization lets uniq output each different
353 line right away, without waiting to see if the next one is different.
355 3. All other cases.
357 if (output_unique && output_first_repeated && countmode == count_none)
359 char *prevfield IF_LINT ( = NULL);
360 size_t prevlen IF_LINT ( = 0);
361 bool first_group_printed = false;
363 while (!feof (stdin))
365 char *thisfield;
366 size_t thislen;
367 bool new_group;
369 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
370 break;
372 thisfield = find_field (thisline);
373 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
375 new_group = (prevline->length == 0
376 || different (thisfield, prevfield, thislen, prevlen));
378 if (new_group && grouping != GM_NONE
379 && (grouping == GM_PREPEND || grouping == GM_BOTH
380 || (first_group_printed && (grouping == GM_APPEND
381 || grouping == GM_SEPARATE))))
382 putchar (delimiter);
384 if (new_group || grouping != GM_NONE)
386 fwrite (thisline->buffer, sizeof (char),
387 thisline->length, stdout);
389 SWAP_LINES (prevline, thisline);
390 prevfield = thisfield;
391 prevlen = thislen;
392 first_group_printed = true;
395 if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
396 putchar (delimiter);
398 else
400 char *prevfield;
401 size_t prevlen;
402 uintmax_t match_count = 0;
403 bool first_delimiter = true;
405 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
406 goto closefiles;
407 prevfield = find_field (prevline);
408 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
410 while (!feof (stdin))
412 bool match;
413 char *thisfield;
414 size_t thislen;
415 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
417 if (ferror (stdin))
418 goto closefiles;
419 break;
421 thisfield = find_field (thisline);
422 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
423 match = !different (thisfield, prevfield, thislen, prevlen);
424 match_count += match;
426 if (match_count == UINTMAX_MAX)
428 if (count_occurrences)
429 error (EXIT_FAILURE, 0, _("too many repeated lines"));
430 match_count--;
433 if (delimit_groups != DM_NONE)
435 if (!match)
437 if (match_count) /* a previous match */
438 first_delimiter = false; /* Only used when DM_SEPARATE */
440 else if (match_count == 1)
442 if ((delimit_groups == DM_PREPEND)
443 || (delimit_groups == DM_SEPARATE
444 && !first_delimiter))
445 putchar (delimiter);
449 if (!match || output_later_repeated)
451 writeline (prevline, match, match_count);
452 SWAP_LINES (prevline, thisline);
453 prevfield = thisfield;
454 prevlen = thislen;
455 if (!match)
456 match_count = 0;
460 writeline (prevline, false, match_count);
463 closefiles:
464 if (ferror (stdin) || fclose (stdin) != 0)
465 error (EXIT_FAILURE, 0, _("error reading %s"), quote (infile));
467 /* stdout is handled via the atexit-invoked close_stdout function. */
469 free (lb1.buffer);
470 free (lb2.buffer);
/* Which form of skip-fields option main saw most recently.  main uses
   it so that obsolete digit options following a -f start a fresh
   number instead of appending digits to the -f value. */
473 enum Skip_field_option_type
/* No skip-fields option seen yet (main's initial value). */
475 SFO_NONE,
/* The most recent one was an obsolete digit option (-0 ... -9). */
476 SFO_OBSOLETE,
/* The most recent one was -f / --skip-fields. */
477 SFO_NEW
/* Program entry point: parse options and up to two file operands,
   reject incompatible option combinations, then run check_file on
   file[0] (input) and file[1] (output).  Both default to "-".
   Returns EXIT_SUCCESS; failures exit through error () or usage (). */
481 main (int argc, char **argv)
483 int optc = 0;
484 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
485 enum Skip_field_option_type skip_field_option_type = SFO_NONE;
486 unsigned int nfiles = 0;
487 char const *file[2];
488 char delimiter = '\n'; /* change with --zero-terminated, -z */
489 bool output_option_used = false; /* if true, one of -u/-d/-D/-c was used */
491 file[0] = file[1] = "-";
492 initialize_main (&argc, &argv);
493 set_program_name (argv[0]);
494 setlocale (LC_ALL, "");
495 bindtextdomain (PACKAGE, LOCALEDIR);
496 textdomain (PACKAGE);
497 hard_LC_COLLATE = hard_locale (LC_COLLATE);
499 atexit (close_stdout);
/* Establish option defaults.  ignore_case and grouping are not
   assigned here; they keep their static initial values (false and
   GM_NONE respectively). */
501 skip_chars = 0;
502 skip_fields = 0;
503 check_chars = SIZE_MAX;
504 output_unique = output_first_repeated = true;
505 output_later_repeated = false;
506 countmode = count_none;
507 delimit_groups = DM_NONE;
/* Option/operand loop; it ends when no argv entries remain. */
509 while (true)
511 /* Parse an operand with leading "+" as a file after "--" was
512 seen; or if pedantic and a file was seen; or if not
513 obsolete. */
/* The tests short-circuit: once optc is -1, or once a file has been
   seen under POSIXLY_CORRECT, getopt_long is no longer called and each
   remaining argv entry is taken as a file operand. */
515 if (optc == -1
516 || (posixly_correct && nfiles != 0)
517 || ((optc = getopt_long (argc, argv,
518 "-0123456789Dcdf:is:uw:z", longopts, NULL))
519 == -1))
521 if (argc <= optind)
522 break;
523 if (nfiles == 2)
525 error (0, 0, _("extra operand %s"), quote (argv[optind]));
526 usage (EXIT_FAILURE);
528 file[nfiles++] = argv[optind++];
530 else switch (optc)
/* Option code 1 carries a non-option argument in optarg; getopt_long
   reports operands this way because the option string begins with
   '-'.  An operand of the form "+N" is the obsolete skip-chars
   syntax, honored only when posix2_version () is older than 200112;
   anything else is a file name. */
532 case 1:
534 unsigned long int size;
535 if (optarg[0] == '+'
536 && posix2_version () < 200112
537 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
538 && size <= SIZE_MAX)
539 skip_chars = size;
540 else if (nfiles == 2)
542 error (0, 0, _("extra operand %s"), quote (optarg));
543 usage (EXIT_FAILURE);
545 else
546 file[nfiles++] = optarg;
548 break;
/* Obsolete -N skip-fields syntax, delivered one digit at a time.
   Digits accumulate into skip_fields; if the previous skip-fields
   option was -f its value is discarded first.  When
   DECIMAL_DIGIT_ACCUMULATE reports failure (presumably overflow of
   size_t -- confirm against its definition) the value is pinned at
   SIZE_MAX. */
550 case '0':
551 case '1':
552 case '2':
553 case '3':
554 case '4':
555 case '5':
556 case '6':
557 case '7':
558 case '8':
559 case '9':
561 if (skip_field_option_type == SFO_NEW)
562 skip_fields = 0;
564 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
565 skip_fields = SIZE_MAX;
567 skip_field_option_type = SFO_OBSOLETE;
569 break;
571 case 'c':
572 countmode = count_occurrences;
573 output_option_used = true;
574 break;
576 case 'd':
577 output_unique = false;
578 output_option_used = true;
579 break;
/* -D and --all-repeated[=METHOD] share this case; only the long form
   can supply optarg, so plain -D resets the method to DM_NONE. */
581 case 'D':
582 output_unique = false;
583 output_later_repeated = true;
584 if (optarg == NULL)
585 delimit_groups = DM_NONE;
586 else
587 delimit_groups = XARGMATCH ("--all-repeated", optarg,
588 delimit_method_string,
589 delimit_method_map);
590 output_option_used = true;
591 break;
/* --group without an argument means "separate".  Unlike the options
   above, this one does not set output_option_used. */
593 case GROUP_OPTION:
594 if (optarg == NULL)
595 grouping = GM_SEPARATE;
596 else
597 grouping = XARGMATCH ("--group", optarg,
598 grouping_method_string,
599 grouping_method_map);
600 break;
602 case 'f':
603 skip_field_option_type = SFO_NEW;
604 skip_fields = size_opt (optarg,
605 N_("invalid number of fields to skip"));
606 break;
608 case 'i':
609 ignore_case = true;
610 break;
612 case 's':
613 skip_chars = size_opt (optarg,
614 N_("invalid number of bytes to skip"));
615 break;
617 case 'u':
618 output_first_repeated = false;
619 output_option_used = true;
620 break;
622 case 'w':
623 check_chars = size_opt (optarg,
624 N_("invalid number of bytes to compare"));
625 break;
627 case 'z':
628 delimiter = '\0';
629 break;
631 case_GETOPT_HELP_CHAR;
633 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
635 default:
636 usage (EXIT_FAILURE);
640 /* Note we could allow --group with -D at least, and that would
641 avoid the need to specify a grouping method to --all-repeated.
642 It was thought best to avoid deprecating those parameters though
643 and keep --group separate to other options. */
644 if (grouping != GM_NONE && output_option_used)
646 error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
647 usage (EXIT_FAILURE);
/* NOTE(review): this check looks unreachable.  countmode leaves
   count_none only in case 'c', which also sets output_option_used, so
   the preceding check has already exited via usage (). */
650 if (grouping != GM_NONE && countmode != count_none)
652 error (0, 0,
653 _("grouping and printing repeat counts is meaningless"));
654 usage (EXIT_FAILURE);
/* -c combined with -D / --all-repeated is rejected. */
657 if (countmode == count_occurrences && output_later_repeated)
659 error (0, 0,
660 _("printing all duplicated lines and repeat counts is meaningless"));
661 usage (EXIT_FAILURE);
664 check_file (file[0], file[1], delimiter);
666 return EXIT_SUCCESS;