version 8.7
[coreutils.git] / src / uniq.c
blob86ca8c9579592ab05a0048bb13ea9699aaf9e0da
1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 1986, 1991, 1995-2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
19 #include <config.h>
21 #include <getopt.h>
22 #include <sys/types.h>
24 #include "system.h"
25 #include "argmatch.h"
26 #include "linebuffer.h"
27 #include "error.h"
28 #include "fadvise.h"
29 #include "hard-locale.h"
30 #include "posixver.h"
31 #include "quote.h"
32 #include "stdio--.h"
33 #include "xmemcoll.h"
34 #include "xstrtol.h"
35 #include "memcasecmp.h"
/* The official name of this program (e.g., no `g' prefix).  */
#define PROGRAM_NAME "uniq"

/* Author list handed to the --version machinery.  */
#define AUTHORS \
  proper_name ("Richard M. Stallman"), \
  proper_name ("David MacKenzie")

/* Exchange the two `struct linebuffer *' lvalues A and B.
   Each argument is evaluated twice, so pass plain variables only.  */
#define SWAP_LINES(A, B)                        \
  do                                            \
    {                                           \
      struct linebuffer *swap_hold = (A);       \
      (A) = (B);                                \
      (B) = swap_hold;                          \
    }                                           \
  while (0)
/* Comparison settings, filled in by main before any input is read.  */

/* True if the LC_COLLATE locale is hard; `different' then compares
   with xmemcoll instead of memcmp.  */
static bool hard_LC_COLLATE;

/* How many leading fields of each line are excluded from comparison.  */
static size_t skip_fields;

/* How many further chars are excluded once the fields are skipped.  */
static size_t skip_chars;

/* Upper bound on the number of chars that are compared.  */
static size_t check_chars;
/* Whether output lines are prefixed with their occurrence count.
   Note that count_occurrences is the first enumerator and so has the
   value 0: compare `countmode' against it, never test it as a flag.  */
enum countmode
{
  count_occurrences,    /* -c: print the count before each output line.  */
  count_none            /* Default: print no counts.  */
};

/* The count mode currently in effect.  */
static enum countmode countmode;
/* Output selection.  Every input line is exactly one of: a unique
   line, the first line of a group of repeated lines, or a second or
   subsequent line of such a group.  One flag per category says
   whether lines of that category are printed.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;

/* If true, comparisons ignore differences in case.  */
static bool ignore_case;
/* How groups of duplicate lines are set apart under --all-repeated.  */
enum delimit_method
{
  DM_NONE,      /* --all-repeated[=none]: no delimiters output.  */
  DM_PREPEND,   /* --all-repeated=prepend: a delimiter precedes all groups.  */
  DM_SEPARATE   /* --all-repeated=separate: delimit all groups.  */
};

/* Accepted spellings of the --all-repeated argument, NULL-terminated.
   Entry N here corresponds to entry N of delimit_method_map.  */
static char const *const delimit_method_string[] =
{
  "none",
  "prepend",
  "separate",
  NULL
};

/* Values matching delimit_method_string, in the same order.  */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE,
  DM_PREPEND,
  DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.  */
static enum delimit_method delimit_groups;
111 static struct option const longopts[] =
113 {"count", no_argument, NULL, 'c'},
114 {"repeated", no_argument, NULL, 'd'},
115 {"all-repeated", optional_argument, NULL, 'D'},
116 {"ignore-case", no_argument, NULL, 'i'},
117 {"unique", no_argument, NULL, 'u'},
118 {"skip-fields", required_argument, NULL, 'f'},
119 {"skip-chars", required_argument, NULL, 's'},
120 {"check-chars", required_argument, NULL, 'w'},
121 {"zero-terminated", no_argument, NULL, 'z'},
122 {GETOPT_HELP_OPTION_DECL},
123 {GETOPT_VERSION_OPTION_DECL},
124 {NULL, 0, NULL, 0}
/* Print usage information and terminate the program with STATUS.

   When STATUS is not EXIT_SUCCESS, only a one-line hint naming the
   --help option is written to stderr.  Otherwise the full help text
   is written to stdout: the synopsis, a description of what the
   program does, one entry per option, the shared --help/--version
   descriptions, notes on fields and on adjacency/LC_COLLATE, and
   finally whatever emit_ancillary_info prints.

   This function does not return: it always ends in exit (STATUS).

   NOTE(review): the embedded line numbers skip values inside this
   function, so brace-only and continuation-only lines appear to be
   missing from this copy -- confirm against the pristine file before
   editing any of the message strings.  */
127 void
128 usage (int status)
/* Failure: keep the message short and send it to stderr.  */
130 if (status != EXIT_SUCCESS)
131 fprintf (stderr, _("Try `%s --help' for more information.\n"),
132 program_name);
133 else
/* Success: emit the complete help text on stdout, piece by piece.  */
135 printf (_("\
136 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
138 program_name);
139 fputs (_("\
140 Filter adjacent matching lines from INPUT (or standard input),\n\
141 writing to OUTPUT (or standard output).\n\
143 With no options, matching lines are merged to the first occurrence.\n\
145 "), stdout);
146 fputs (_("\
147 Mandatory arguments to long options are mandatory for short options too.\n\
148 "), stdout);
149 fputs (_("\
150 -c, --count prefix lines by the number of occurrences\n\
151 -d, --repeated only print duplicate lines\n\
152 "), stdout);
153 fputs (_("\
154 -D, --all-repeated[=delimit-method] print all duplicate lines\n\
155 delimit-method={none(default),prepend,separate}\n\
156 Delimiting is done with blank lines\n\
157 -f, --skip-fields=N avoid comparing the first N fields\n\
158 -i, --ignore-case ignore differences in case when comparing\n\
159 -s, --skip-chars=N avoid comparing the first N characters\n\
160 -u, --unique only print unique lines\n\
161 -z, --zero-terminated end lines with 0 byte, not newline\n\
162 "), stdout);
163 fputs (_("\
164 -w, --check-chars=N compare no more than N characters in lines\n\
165 "), stdout);
166 fputs (HELP_OPTION_DESCRIPTION, stdout);
167 fputs (VERSION_OPTION_DESCRIPTION, stdout);
168 fputs (_("\
170 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
171 characters. Fields are skipped before chars.\n\
172 "), stdout);
173 fputs (_("\
175 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
176 You may want to sort the input first, or use `sort -u' without `uniq'.\n\
177 Also, comparisons honor the rules specified by `LC_COLLATE'.\n\
178 "), stdout);
179 emit_ancillary_info ();
181 exit (status);
184 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
185 invalid. Silently convert too-large values to SIZE_MAX. */
187 static size_t
188 size_opt (char const *opt, char const *msgid)
190 unsigned long int size;
191 verify (SIZE_MAX <= ULONG_MAX);
193 switch (xstrtoul (opt, NULL, 10, &size, ""))
195 case LONGINT_OK:
196 case LONGINT_OVERFLOW:
197 break;
199 default:
200 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
203 return MIN (size, SIZE_MAX);
206 /* Given a linebuffer LINE,
207 return a pointer to the beginning of the line's field to be compared. */
209 static char *
210 find_field (struct linebuffer const *line)
212 size_t count;
213 char const *lp = line->buffer;
214 size_t size = line->length - 1;
215 size_t i = 0;
217 for (count = 0; count < skip_fields; count++)
219 while (i < size && isblank (to_uchar (lp[i])))
220 i++;
221 while (i < size && !isblank (to_uchar (lp[i])))
222 i++;
225 for (count = 0; count < skip_chars && i < size; count++)
226 i++;
228 return line->buffer + i;
231 /* Return false if two strings OLD and NEW match, true if not.
232 OLD and NEW point not to the beginnings of the lines
233 but rather to the beginnings of the fields to compare.
234 OLDLEN and NEWLEN are their lengths. */
236 static bool
237 different (char *old, char *new, size_t oldlen, size_t newlen)
239 if (check_chars < oldlen)
240 oldlen = check_chars;
241 if (check_chars < newlen)
242 newlen = check_chars;
244 if (ignore_case)
246 /* FIXME: This should invoke strcoll somehow. */
247 return oldlen != newlen || memcasecmp (old, new, oldlen);
249 else if (hard_LC_COLLATE)
250 return xmemcoll (old, oldlen, new, newlen) != 0;
251 else
252 return oldlen != newlen || memcmp (old, new, oldlen);
255 /* Output the line in linebuffer LINE to standard output
256 provided that the switches say it should be output.
257 MATCH is true if the line matches the previous line.
258 If requested, print the number of times it occurred, as well;
259 LINECOUNT + 1 is the number of times that the line occurred. */
261 static void
262 writeline (struct linebuffer const *line,
263 bool match, uintmax_t linecount)
265 if (! (linecount == 0 ? output_unique
266 : !match ? output_first_repeated
267 : output_later_repeated))
268 return;
270 if (countmode == count_occurrences)
271 printf ("%7" PRIuMAX " ", linecount + 1);
273 fwrite (line->buffer, sizeof (char), line->length, stdout);
276 /* Process input file INFILE with output to OUTFILE.
277 If either is "-", use the standard I/O stream for it instead. */
279 static void
280 check_file (const char *infile, const char *outfile, char delimiter)
282 struct linebuffer lb1, lb2;
283 struct linebuffer *thisline, *prevline;
285 if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
286 error (EXIT_FAILURE, errno, "%s", infile);
287 if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
288 error (EXIT_FAILURE, errno, "%s", outfile);
290 fadvise (stdin, FADVISE_SEQUENTIAL);
292 thisline = &lb1;
293 prevline = &lb2;
295 initbuffer (thisline);
296 initbuffer (prevline);
298 /* The duplication in the following `if' and `else' blocks is an
299 optimization to distinguish the common case (in which none of
300 the following options has been specified: --count, -repeated,
301 --all-repeated, --unique) from the others. In the common case,
302 this optimization lets uniq output each different line right away,
303 without waiting to see if the next one is different. */
305 if (output_unique && output_first_repeated && countmode == count_none)
307 char *prevfield IF_LINT ( = NULL);
308 size_t prevlen IF_LINT ( = 0);
310 while (!feof (stdin))
312 char *thisfield;
313 size_t thislen;
314 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
315 break;
316 thisfield = find_field (thisline);
317 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
318 if (prevline->length == 0
319 || different (thisfield, prevfield, thislen, prevlen))
321 fwrite (thisline->buffer, sizeof (char),
322 thisline->length, stdout);
324 SWAP_LINES (prevline, thisline);
325 prevfield = thisfield;
326 prevlen = thislen;
330 else
332 char *prevfield;
333 size_t prevlen;
334 uintmax_t match_count = 0;
335 bool first_delimiter = true;
337 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
338 goto closefiles;
339 prevfield = find_field (prevline);
340 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
342 while (!feof (stdin))
344 bool match;
345 char *thisfield;
346 size_t thislen;
347 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
349 if (ferror (stdin))
350 goto closefiles;
351 break;
353 thisfield = find_field (thisline);
354 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
355 match = !different (thisfield, prevfield, thislen, prevlen);
356 match_count += match;
358 if (match_count == UINTMAX_MAX)
360 if (count_occurrences)
361 error (EXIT_FAILURE, 0, _("too many repeated lines"));
362 match_count--;
365 if (delimit_groups != DM_NONE)
367 if (!match)
369 if (match_count) /* a previous match */
370 first_delimiter = false; /* Only used when DM_SEPARATE */
372 else if (match_count == 1)
374 if ((delimit_groups == DM_PREPEND)
375 || (delimit_groups == DM_SEPARATE
376 && !first_delimiter))
377 putchar (delimiter);
381 if (!match || output_later_repeated)
383 writeline (prevline, match, match_count);
384 SWAP_LINES (prevline, thisline);
385 prevfield = thisfield;
386 prevlen = thislen;
387 if (!match)
388 match_count = 0;
392 writeline (prevline, false, match_count);
395 closefiles:
396 if (ferror (stdin) || fclose (stdin) != 0)
397 error (EXIT_FAILURE, 0, _("error reading %s"), infile);
399 /* stdout is handled via the atexit-invoked close_stdout function. */
401 free (lb1.buffer);
402 free (lb2.buffer);
/* Which syntax, if any, most recently supplied the field-skip count
   while main parses the command line.  */
enum Skip_field_option_type
{
  SFO_NONE,             /* No field-skip option seen so far.  */
  SFO_OBSOLETE,         /* Given as bare digits (-N).  */
  SFO_NEW               /* Given with -f / --skip-fields.  */
};
413 main (int argc, char **argv)
415 int optc = 0;
416 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
417 enum Skip_field_option_type skip_field_option_type = SFO_NONE;
418 int nfiles = 0;
419 char const *file[2];
420 char delimiter = '\n'; /* change with --zero-terminated, -z */
422 file[0] = file[1] = "-";
423 initialize_main (&argc, &argv);
424 set_program_name (argv[0]);
425 setlocale (LC_ALL, "");
426 bindtextdomain (PACKAGE, LOCALEDIR);
427 textdomain (PACKAGE);
428 hard_LC_COLLATE = hard_locale (LC_COLLATE);
430 atexit (close_stdout);
432 skip_chars = 0;
433 skip_fields = 0;
434 check_chars = SIZE_MAX;
435 output_unique = output_first_repeated = true;
436 output_later_repeated = false;
437 countmode = count_none;
438 delimit_groups = DM_NONE;
440 while (true)
442 /* Parse an operand with leading "+" as a file after "--" was
443 seen; or if pedantic and a file was seen; or if not
444 obsolete. */
446 if (optc == -1
447 || (posixly_correct && nfiles != 0)
448 || ((optc = getopt_long (argc, argv,
449 "-0123456789Dcdf:is:uw:z", longopts, NULL))
450 == -1))
452 if (argc <= optind)
453 break;
454 if (nfiles == 2)
456 error (0, 0, _("extra operand %s"), quote (argv[optind]));
457 usage (EXIT_FAILURE);
459 file[nfiles++] = argv[optind++];
461 else switch (optc)
463 case 1:
465 unsigned long int size;
466 if (optarg[0] == '+'
467 && posix2_version () < 200112
468 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
469 && size <= SIZE_MAX)
470 skip_chars = size;
471 else if (nfiles == 2)
473 error (0, 0, _("extra operand %s"), quote (optarg));
474 usage (EXIT_FAILURE);
476 else
477 file[nfiles++] = optarg;
479 break;
481 case '0':
482 case '1':
483 case '2':
484 case '3':
485 case '4':
486 case '5':
487 case '6':
488 case '7':
489 case '8':
490 case '9':
492 if (skip_field_option_type == SFO_NEW)
493 skip_fields = 0;
495 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
496 skip_fields = SIZE_MAX;
498 skip_field_option_type = SFO_OBSOLETE;
500 break;
502 case 'c':
503 countmode = count_occurrences;
504 break;
506 case 'd':
507 output_unique = false;
508 break;
510 case 'D':
511 output_unique = false;
512 output_later_repeated = true;
513 if (optarg == NULL)
514 delimit_groups = DM_NONE;
515 else
516 delimit_groups = XARGMATCH ("--all-repeated", optarg,
517 delimit_method_string,
518 delimit_method_map);
519 break;
521 case 'f':
522 skip_field_option_type = SFO_NEW;
523 skip_fields = size_opt (optarg,
524 N_("invalid number of fields to skip"));
525 break;
527 case 'i':
528 ignore_case = true;
529 break;
531 case 's':
532 skip_chars = size_opt (optarg,
533 N_("invalid number of bytes to skip"));
534 break;
536 case 'u':
537 output_first_repeated = false;
538 break;
540 case 'w':
541 check_chars = size_opt (optarg,
542 N_("invalid number of bytes to compare"));
543 break;
545 case 'z':
546 delimiter = '\0';
547 break;
549 case_GETOPT_HELP_CHAR;
551 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
553 default:
554 usage (EXIT_FAILURE);
558 if (countmode == count_occurrences && output_later_repeated)
560 error (0, 0,
561 _("printing all duplicated lines and repeat counts is meaningless"));
562 usage (EXIT_FAILURE);
565 check_file (file[0], file[1], delimiter);
567 exit (EXIT_SUCCESS);