1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 1986-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
22 #include <sys/types.h>
26 #include "linebuffer.h"
30 #include "skipchars.h"
33 #include "memcasecmp.h"
36 /* The official name of this program (e.g., no 'g' prefix). */
37 #define PROGRAM_NAME "uniq"
40 proper_name ("Richard M. Stallman"), \
41 proper_name ("David MacKenzie")
44 swap_lines (struct linebuffer
**a
, struct linebuffer
**b
)
46 struct linebuffer
*tmp
= *a
;
51 /* Number of fields to skip on each line when doing comparisons. */
52 static idx_t skip_fields
= false;
54 /* Number of chars to skip after skipping any fields. */
55 static idx_t skip_chars
= false;
57 /* Number of chars to compare. */
58 static idx_t check_chars
= IDX_MAX
;
60 /* Whether to precede the output lines with a count of the number of
61 times they occurred in the input. */
62 static bool count_occurrences
= false;
64 /* Which lines to output: unique lines, the first of a group of
65 repeated lines, and the second and subsequent of a group of
67 static bool output_unique
= true;
68 static bool output_first_repeated
= true;
69 static bool output_later_repeated
= false;
71 /* If true, ignore case when comparing. */
72 static bool ignore_case
;
76 /* No delimiters output. --all-repeated[=none] */
79 /* Delimiter precedes all groups. --all-repeated=prepend */
82 /* Delimit all groups. --all-repeated=separate */
86 static char const *const delimit_method_string
[] =
88 "none", "prepend", "separate", nullptr
91 static enum delimit_method
const delimit_method_map
[] =
93 DM_NONE
, DM_PREPEND
, DM_SEPARATE
96 /* Select whether/how to delimit groups of duplicate lines. */
97 static enum delimit_method delimit_groups
= DM_NONE
;
101 /* No grouping, when "--group" isn't used */
104 /* Delimiter precedes all groups. --group=prepend */
107 /* Delimiter follows all groups. --group=append */
110 /* Delimiter between groups. --group[=separate] */
113 /* Delimiter before and after each group. --group=both */
117 static char const *const grouping_method_string
[] =
119 "prepend", "append", "separate", "both", nullptr
122 static enum grouping_method
const grouping_method_map
[] =
124 GM_PREPEND
, GM_APPEND
, GM_SEPARATE
, GM_BOTH
127 static enum grouping_method grouping
= GM_NONE
;
131 GROUP_OPTION
= CHAR_MAX
+ 1
134 static struct option
const longopts
[] =
136 {"count", no_argument
, nullptr, 'c'},
137 {"repeated", no_argument
, nullptr, 'd'},
138 {"all-repeated", optional_argument
, nullptr, 'D'},
139 {"group", optional_argument
, nullptr, GROUP_OPTION
},
140 {"ignore-case", no_argument
, nullptr, 'i'},
141 {"unique", no_argument
, nullptr, 'u'},
142 {"skip-fields", required_argument
, nullptr, 'f'},
143 {"skip-chars", required_argument
, nullptr, 's'},
144 {"check-chars", required_argument
, nullptr, 'w'},
145 {"zero-terminated", no_argument
, nullptr, 'z'},
146 {GETOPT_HELP_OPTION_DECL
},
147 {GETOPT_VERSION_OPTION_DECL
},
148 {nullptr, 0, nullptr, 0}
154 if (status
!= EXIT_SUCCESS
)
159 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
163 Filter adjacent matching lines from INPUT (or standard input),\n\
164 writing to OUTPUT (or standard output).\n\
166 With no options, matching lines are merged to the first occurrence.\n\
169 emit_mandatory_arg_note ();
172 -c, --count prefix lines by the number of occurrences\n\
173 -d, --repeated only print duplicate lines, one for each group\n\
176 -D print all duplicate lines\n\
177 --all-repeated[=METHOD] like -D, but allow separating groups\n\
178 with an empty line;\n\
179 METHOD={none(default),prepend,separate}\n\
182 -f, --skip-fields=N avoid comparing the first N fields\n\
185 --group[=METHOD] show all items, separating groups with an empty line;\n\
186 METHOD={separate(default),prepend,append,both}\n\
189 -i, --ignore-case ignore differences in case when comparing\n\
190 -s, --skip-chars=N avoid comparing the first N characters\n\
191 -u, --unique only print unique lines\n\
194 -z, --zero-terminated line delimiter is NUL, not newline\n\
197 -w, --check-chars=N compare no more than N characters in lines\n\
199 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
200 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
203 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
204 characters. Fields are skipped before chars.\n\
208 'uniq' does not detect repeated lines unless they are adjacent.\n\
209 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
211 emit_ancillary_info (PROGRAM_NAME
);
219 int posix_ver
= posix2_version ();
220 return 200112 <= posix_ver
&& posix_ver
< 200809;
223 /* Convert OPT to idx_t, reporting an error using MSGID if OPT is
224 invalid. Silently convert too-large values to IDX_MAX. */
227 size_opt (char const *opt
, char const *msgid
)
230 if (LONGINT_OVERFLOW
< xstrtoimax (opt
, nullptr, 10, &size
, "")
232 error (EXIT_FAILURE
, 0, "%s: %s", opt
, _(msgid
));
233 return MIN (size
, IDX_MAX
);
237 newline_or_blank (mcel_t g
)
239 return g
.ch
== '\n' || c32isblank (g
.ch
);
242 /* Given a linebuffer LINE,
243 return a pointer to the beginning of the line's field to be compared.
244 Store into *PLEN the length in bytes of FIELD. */
247 find_field (struct linebuffer
const *line
, idx_t
*plen
)
249 char *lp
= line
->buffer
;
250 char const *lim
= lp
+ line
->length
- 1;
252 for (idx_t i
= skip_fields
; 0 < i
&& lp
< lim
; i
--)
254 lp
= skip_buf_matching (lp
, lim
, newline_or_blank
, true);
255 lp
= skip_buf_matching (lp
, lim
, newline_or_blank
, false);
258 for (idx_t i
= skip_chars
; 0 < i
&& lp
< lim
; i
--)
259 lp
+= mcel_scan (lp
, lim
).len
;
261 /* Compute the length in bytes cheaply if possible; otherwise, scan. */
263 if (lim
- lp
<= check_chars
)
265 else if (MB_CUR_MAX
<= 1)
270 for (idx_t i
= check_chars
; 0 < i
&& lp
< lim
; i
--)
271 ep
+= mcel_scan (lp
, lim
).len
;
279 /* Return false if two strings OLD and NEW match, true if not.
280 OLD and NEW point not to the beginnings of the lines
281 but rather to the beginnings of the fields to compare.
282 OLDLEN and NEWLEN are their lengths. */
285 different (char *old
, char *new, idx_t oldlen
, idx_t newlen
)
288 return oldlen
!= newlen
|| memcasecmp (old
, new, oldlen
);
290 return oldlen
!= newlen
|| memcmp (old
, new, oldlen
);
293 /* Output the line in linebuffer LINE to standard output
294 provided that the switches say it should be output.
295 MATCH is true if the line matches the previous line.
296 If requested, print the number of times it occurred, as well;
297 LINECOUNT + 1 is the number of times that the line occurred. */
300 writeline (struct linebuffer
const *line
,
301 bool match
, intmax_t linecount
)
303 if (! (linecount
== 0 ? output_unique
304 : !match
? output_first_repeated
305 : output_later_repeated
))
308 if (count_occurrences
)
309 printf ("%7jd ", linecount
+ 1);
311 if (fwrite (line
->buffer
, sizeof (char), line
->length
, stdout
)
316 /* Process input file INFILE with output to OUTFILE.
317 If either is "-", use the standard I/O stream for it instead. */
320 check_file (char const *infile
, char const *outfile
, char delimiter
)
322 struct linebuffer lb1
, lb2
;
323 struct linebuffer
*thisline
, *prevline
;
325 if (! (STREQ (infile
, "-") || freopen (infile
, "r", stdin
)))
326 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
327 if (! (STREQ (outfile
, "-") || freopen (outfile
, "w", stdout
)))
328 error (EXIT_FAILURE
, errno
, "%s", quotef (outfile
));
330 fadvise (stdin
, FADVISE_SEQUENTIAL
);
335 initbuffer (thisline
);
336 initbuffer (prevline
);
338 /* The duplication in the following 'if' and 'else' blocks is an
339 optimization to distinguish between when we can print input
340 lines immediately (1. & 2.) or not.
342 1. --group => all input lines are printed.
343 checking for unique/duplicated lines is used only for printing
346 2. The default case in which none of these options has been specified:
347 --count, --repeated, --all-repeated, --unique
348 In the default case, this optimization lets uniq output each different
349 line right away, without waiting to see if the next one is different.
353 if (output_unique
&& output_first_repeated
&& !count_occurrences
)
355 char *prevfield
= nullptr;
357 bool first_group_printed
= false;
360 && readlinebuffer_delim (thisline
, stdin
, delimiter
) != 0)
363 char *thisfield
= find_field (thisline
, &thislen
);
364 bool new_group
= (!prevfield
365 || different (thisfield
, prevfield
,
368 if (new_group
&& grouping
!= GM_NONE
369 && (grouping
== GM_PREPEND
|| grouping
== GM_BOTH
370 || (first_group_printed
&& (grouping
== GM_APPEND
371 || grouping
== GM_SEPARATE
))))
374 if (new_group
|| grouping
!= GM_NONE
)
376 if (fwrite (thisline
->buffer
, sizeof (char), thisline
->length
,
377 stdout
) != thisline
->length
)
380 swap_lines (&prevline
, &thisline
);
381 prevfield
= thisfield
;
383 first_group_printed
= true;
386 if ((grouping
== GM_BOTH
|| grouping
== GM_APPEND
) && first_group_printed
)
391 if (readlinebuffer_delim (prevline
, stdin
, delimiter
) == 0)
395 char *prevfield
= find_field (prevline
, &prevlen
);
396 intmax_t match_count
= 0;
397 bool first_delimiter
= true;
399 while (!feof (stdin
))
401 if (readlinebuffer_delim (thisline
, stdin
, delimiter
) == 0)
408 char *thisfield
= find_field (thisline
, &thislen
);
409 bool match
= !different (thisfield
, prevfield
, thislen
, prevlen
);
410 match_count
+= match
;
412 if (match_count
== INTMAX_MAX
)
414 if (count_occurrences
)
415 error (EXIT_FAILURE
, 0, _("too many repeated lines"));
419 if (delimit_groups
!= DM_NONE
)
423 if (match_count
) /* a previous match */
424 first_delimiter
= false; /* Only used when DM_SEPARATE */
426 else if (match_count
== 1)
428 if ((delimit_groups
== DM_PREPEND
)
429 || (delimit_groups
== DM_SEPARATE
430 && !first_delimiter
))
435 if (!match
|| output_later_repeated
)
437 writeline (prevline
, match
, match_count
);
438 swap_lines (&prevline
, &thisline
);
439 prevfield
= thisfield
;
446 writeline (prevline
, false, match_count
);
450 if (ferror (stdin
) || fclose (stdin
) != 0)
451 error (EXIT_FAILURE
, errno
, _("error reading %s"), quoteaf (infile
));
453 /* stdout is handled via the atexit-invoked close_stdout function. */
459 enum Skip_field_option_type
467 main (int argc
, char **argv
)
470 bool posixly_correct
= (getenv ("POSIXLY_CORRECT") != nullptr);
471 enum Skip_field_option_type skip_field_option_type
= SFO_NONE
;
474 char delimiter
= '\n'; /* change with --zero-terminated, -z */
475 bool output_option_used
= false; /* if true, one of -u/-d/-D/-c was used */
477 file
[0] = file
[1] = "-";
478 initialize_main (&argc
, &argv
);
479 set_program_name (argv
[0]);
480 setlocale (LC_ALL
, "");
481 bindtextdomain (PACKAGE
, LOCALEDIR
);
482 textdomain (PACKAGE
);
484 atexit (close_stdout
);
488 /* Parse an operand with leading "+" as a file after "--" was
489 seen; or if pedantic and a file was seen; or if not
493 || (posixly_correct
&& nfiles
!= 0)
494 || ((optc
= getopt_long (argc
, argv
,
495 "-0123456789Dcdf:is:uw:z",
503 error (0, 0, _("extra operand %s"), quote (argv
[optind
]));
504 usage (EXIT_FAILURE
);
506 file
[nfiles
++] = argv
[optind
++];
514 && ! strict_posix2 ()
515 && (xstrtoimax (optarg
, nullptr, 10, &size
, "")
516 <= LONGINT_OVERFLOW
))
517 skip_chars
= MIN (size
, IDX_MAX
);
518 else if (nfiles
== 2)
520 error (0, 0, _("extra operand %s"), quote (optarg
));
521 usage (EXIT_FAILURE
);
524 file
[nfiles
++] = optarg
;
539 if (skip_field_option_type
== SFO_NEW
)
542 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields
, optc
- '0'))
543 skip_fields
= IDX_MAX
;
545 skip_field_option_type
= SFO_OBSOLETE
;
550 count_occurrences
= true;
551 output_option_used
= true;
555 output_unique
= false;
556 output_option_used
= true;
560 output_unique
= false;
561 output_later_repeated
= true;
562 if (optarg
== nullptr)
563 delimit_groups
= DM_NONE
;
565 delimit_groups
= XARGMATCH ("--all-repeated", optarg
,
566 delimit_method_string
,
568 output_option_used
= true;
572 if (optarg
== nullptr)
573 grouping
= GM_SEPARATE
;
575 grouping
= XARGMATCH ("--group", optarg
,
576 grouping_method_string
,
577 grouping_method_map
);
581 skip_field_option_type
= SFO_NEW
;
582 skip_fields
= size_opt (optarg
,
583 N_("invalid number of fields to skip"));
591 skip_chars
= size_opt (optarg
,
592 N_("invalid number of bytes to skip"));
596 output_first_repeated
= false;
597 output_option_used
= true;
601 check_chars
= size_opt (optarg
,
602 N_("invalid number of bytes to compare"));
609 case_GETOPT_HELP_CHAR
;
611 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
614 usage (EXIT_FAILURE
);
618 /* Note we could allow --group with -D at least, and that would
619 avoid the need to specify a grouping method to --all-repeated.
620 It was thought best to avoid deprecating those parameters though
621 and keep --group separate to other options. */
622 if (grouping
!= GM_NONE
&& output_option_used
)
624 error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
625 usage (EXIT_FAILURE
);
628 if (grouping
!= GM_NONE
&& count_occurrences
)
631 _("grouping and printing repeat counts is meaningless"));
632 usage (EXIT_FAILURE
);
635 if (count_occurrences
&& output_later_repeated
)
638 _("printing all duplicated lines and repeat counts is meaningless"));
639 usage (EXIT_FAILURE
);
642 check_file (file
[0], file
[1], delimiter
);