*** empty log message ***
[coreutils.git] / src / uniq.c
blob1dd037a40a8c241670eb7a2c0dbbf6261b06dad1
1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 86, 91, 1995-2002, Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by Richard Stallman and David MacKenzie. */
20 #include <config.h>
22 #include <stdio.h>
23 #include <getopt.h>
24 #include <sys/types.h>
26 #include "system.h"
27 #include "closeout.h"
28 #include "argmatch.h"
29 #include "linebuffer.h"
30 #include "error.h"
31 #include "posixver.h"
32 #include "xstrtol.h"
33 #include "memcasecmp.h"
/* The official name of this program (e.g., no `g' prefix).  */
#define PROGRAM_NAME "uniq"

/* Author credit, marked for translation.  */
#define AUTHORS N_ ("Richard Stallman and David MacKenzie")

/* Exchange the two `struct linebuffer *' lvalues A and B.
   Each argument is evaluated twice, so neither may have side effects.
   The do/while (0) wrapper makes the expansion behave as one statement.  */
#define SWAP_LINES(A, B)                        \
  do                                            \
    {                                           \
      struct linebuffer *swap_tmp_ = (A);       \
      (A) = (B);                                \
      (B) = swap_tmp_;                          \
    }                                           \
  while (0)
/* The name this program was run with (set from argv[0] in main).  */
char *program_name;

/* Number of fields to skip on each line when doing comparisons.  */
static size_t skip_fields;

/* Number of chars to skip after skipping any fields.  */
static size_t skip_chars;

/* Maximum number of chars to compare, counted from the point reached
   after the field and char skips above.  */
static size_t check_chars;
/* Whether output lines are prefixed with an occurrence count.  */
enum countmode
{
  count_occurrences,    /* -c Print count before output lines.  */
  count_none            /* Default.  Do not print counts.  */
};

/* Whether and how to precede the output lines with a count of the number of
   times they occurred in the input.  */
static enum countmode countmode;
/* The line-selection policies that the command line can request.  */
enum output_mode
{
  output_repeated,      /* -d Only lines that are repeated.  */
  output_all_repeated,  /* -D All lines that are repeated.  */
  output_unique,        /* -u Only lines that are not repeated.  */
  output_all            /* Default.  Print first copy of each line.  */
};

/* Which lines to output.  */
static enum output_mode mode;

/* If nonzero, ignore case when comparing.  */
static int ignore_case;
/* How groups of duplicate lines are separated under --all-repeated.  */
enum delimit_method
{
  /* No delimiters output.  --all-repeated[=none] */
  DM_NONE,

  /* Delimiter precedes all groups.  --all-repeated=prepend */
  DM_PREPEND,

  /* Delimit all groups.  --all-repeated=separate */
  DM_SEPARATE
};

/* Accepted spellings of the --all-repeated argument, terminated by a
   null entry.  Entry I corresponds to delimit_method_map[I], so the two
   tables must be kept in the same order.  */
static char const *const delimit_method_string[] =
{
  "none", "prepend", "separate", 0
};

/* Value selected by the string at the same index in
   delimit_method_string.  */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE, DM_PREPEND, DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.  */
static enum delimit_method delimit_groups;
111 static struct option const longopts[] =
113 {"count", no_argument, NULL, 'c'},
114 {"repeated", no_argument, NULL, 'd'},
115 {"all-repeated", optional_argument, NULL, 'D'},
116 {"ignore-case", no_argument, NULL, 'i'},
117 {"unique", no_argument, NULL, 'u'},
118 {"skip-fields", required_argument, NULL, 'f'},
119 {"skip-chars", required_argument, NULL, 's'},
120 {"check-chars", required_argument, NULL, 'w'},
121 {GETOPT_HELP_OPTION_DECL},
122 {GETOPT_VERSION_OPTION_DECL},
123 {NULL, 0, NULL, 0}
126 void
127 usage (int status)
129 if (status != 0)
130 fprintf (stderr, _("Try `%s --help' for more information.\n"),
131 program_name);
132 else
134 printf (_("\
135 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
137 program_name);
138 fputs (_("\
139 Discard all but one of successive identical lines from INPUT (or\n\
140 standard input), writing to OUTPUT (or standard output).\n\
142 "), stdout);
143 fputs (_("\
144 Mandatory arguments to long options are mandatory for short options too.\n\
145 "), stdout);
146 fputs (_("\
147 -c, --count prefix lines by the number of occurrences\n\
148 -d, --repeated only print duplicate lines\n\
149 "), stdout);
150 fputs (_("\
151 -D, --all-repeated[=delimit-method] print all duplicate lines\n\
152 delimit-method={none(default),prepend,separate}\n\
153 Delimiting is done with blank lines.\n\
154 -f, --skip-fields=N avoid comparing the first N fields\n\
155 -i, --ignore-case ignore differences in case when comparing\n\
156 -s, --skip-chars=N avoid comparing the first N characters\n\
157 -u, --unique only print unique lines\n\
158 "), stdout);
159 fputs (_("\
160 -w, --check-chars=N compare no more than N characters in lines\n\
161 "), stdout);
162 fputs (HELP_OPTION_DESCRIPTION, stdout);
163 fputs (VERSION_OPTION_DESCRIPTION, stdout);
164 fputs (_("\
166 A field is a run of whitespace, then non-whitespace characters.\n\
167 Fields are skipped before chars.\n\
168 "), stdout);
169 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
171 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
174 /* Convert OPT to size_t, reporting an error using MSGID if it does
175 not fit. */
177 static size_t
178 size_opt (char const *opt, char const *msgid)
180 unsigned long int size;
181 if (xstrtoul (opt, NULL, 10, &size, "") != LONGINT_OK
182 || SIZE_MAX < size)
183 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
184 return size;
187 /* Given a linebuffer LINE,
188 return a pointer to the beginning of the line's field to be compared. */
190 static char *
191 find_field (const struct linebuffer *line)
193 register size_t count;
194 register char *lp = line->buffer;
195 register size_t size = line->length - 1;
196 register size_t i = 0;
198 for (count = 0; count < skip_fields && i < size; count++)
200 while (i < size && ISBLANK (lp[i]))
201 i++;
202 while (i < size && !ISBLANK (lp[i]))
203 i++;
206 for (count = 0; count < skip_chars && i < size; count++)
207 i++;
209 return lp + i;
212 /* Return zero if two strings OLD and NEW match, nonzero if not.
213 OLD and NEW point not to the beginnings of the lines
214 but rather to the beginnings of the fields to compare.
215 OLDLEN and NEWLEN are their lengths. */
217 static int
218 different (const char *old, const char *new, size_t oldlen, size_t newlen)
220 if (check_chars < oldlen)
221 oldlen = check_chars;
222 if (check_chars < newlen)
223 newlen = check_chars;
225 if (oldlen != newlen)
226 return 1;
228 /* Use an if-statement here rather than a function variable to
229 avoid portability hassles of getting a non-conflicting declaration
230 of memcmp. */
231 if (ignore_case)
232 return memcasecmp (old, new, oldlen);
233 else
234 return memcmp (old, new, oldlen);
237 /* Output the line in linebuffer LINE to stream STREAM
238 provided that the switches say it should be output.
239 If requested, print the number of times it occurred, as well;
240 LINECOUNT + 1 is the number of times that the line occurred. */
242 static void
243 writeline (const struct linebuffer *line, FILE *stream, int linecount)
245 if ((mode == output_unique && linecount != 0)
246 || (mode == output_repeated && linecount == 0)
247 || (mode == output_all_repeated && linecount == 0))
248 return;
250 if (countmode == count_occurrences)
251 fprintf (stream, "%7d\t", linecount + 1);
253 fwrite (line->buffer, sizeof (char), line->length, stream);
256 /* Process input file INFILE with output to OUTFILE.
257 If either is "-", use the standard I/O stream for it instead. */
259 static void
260 check_file (const char *infile, const char *outfile)
262 FILE *istream;
263 FILE *ostream;
264 struct linebuffer lb1, lb2;
265 struct linebuffer *thisline, *prevline;
267 if (STREQ (infile, "-"))
268 istream = stdin;
269 else
270 istream = fopen (infile, "r");
271 if (istream == NULL)
272 error (EXIT_FAILURE, errno, "%s", infile);
274 if (STREQ (outfile, "-"))
275 ostream = stdout;
276 else
277 ostream = fopen (outfile, "w");
278 if (ostream == NULL)
279 error (EXIT_FAILURE, errno, "%s", outfile);
281 thisline = &lb1;
282 prevline = &lb2;
284 initbuffer (thisline);
285 initbuffer (prevline);
287 /* The duplication in the following `if' and `else' blocks is an
288 optimization to distinguish the common case (in which none of
289 the following options has been specified: --count, -repeated,
290 --all-repeated, --unique) from the others. In the common case,
291 this optimization lets uniq output each different line right away,
292 without waiting to see if the next one is different. */
294 if (mode == output_all && countmode == count_none)
296 char *prevfield IF_LINT (= NULL);
297 size_t prevlen IF_LINT (= 0);
299 while (!feof (istream))
301 char *thisfield;
302 size_t thislen;
303 if (readline (thisline, istream) == 0)
304 break;
305 thisfield = find_field (thisline);
306 thislen = thisline->length - (thisfield - thisline->buffer);
307 if (prevline->length == 0
308 || different (thisfield, prevfield, thislen, prevlen))
310 fwrite (thisline->buffer, sizeof (char),
311 thisline->length, ostream);
313 SWAP_LINES (prevline, thisline);
314 prevfield = thisfield;
315 prevlen = thislen;
319 else
321 char *prevfield;
322 size_t prevlen;
323 int match_count = 0;
324 int first_delimiter = 1;
326 if (readline (prevline, istream) == 0)
327 goto closefiles;
328 prevfield = find_field (prevline);
329 prevlen = prevline->length - (prevfield - prevline->buffer);
331 while (!feof (istream))
333 int match;
334 char *thisfield;
335 size_t thislen;
336 if (readline (thisline, istream) == 0)
337 break;
338 thisfield = find_field (thisline);
339 thislen = thisline->length - (thisfield - thisline->buffer);
340 match = !different (thisfield, prevfield, thislen, prevlen);
342 if (match)
343 ++match_count;
345 if (mode == output_all_repeated && delimit_groups != DM_NONE)
347 if (!match)
349 if (match_count) /* a previous match */
350 first_delimiter = 0; /* Only used when DM_SEPARATE */
352 else if (match_count == 1)
354 if ((delimit_groups == DM_PREPEND)
355 || (delimit_groups == DM_SEPARATE
356 && !first_delimiter))
357 putc ('\n', ostream);
361 if (!match || mode == output_all_repeated)
363 writeline (prevline, ostream, match_count);
364 SWAP_LINES (prevline, thisline);
365 prevfield = thisfield;
366 prevlen = thislen;
367 if (!match)
368 match_count = 0;
372 writeline (prevline, ostream, match_count);
375 closefiles:
376 if (ferror (istream) || fclose (istream) == EOF)
377 error (EXIT_FAILURE, errno, _("error reading %s"), infile);
379 /* Close ostream only if it's not stdout -- the latter is closed
380 via the atexit-invoked close_stdout. */
381 if (ostream != stdout && (ferror (ostream) || fclose (ostream) == EOF))
382 error (EXIT_FAILURE, errno, _("error writing %s"), outfile);
384 free (lb1.buffer);
385 free (lb2.buffer);
389 main (int argc, char **argv)
391 int optc = 0;
392 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
393 bool obsolete_skip_fields = false;
394 int nfiles = 0;
395 char const *file[2];
397 file[0] = file[1] = "-";
398 program_name = argv[0];
399 setlocale (LC_ALL, "");
400 bindtextdomain (PACKAGE, LOCALEDIR);
401 textdomain (PACKAGE);
403 atexit (close_stdout);
405 skip_chars = 0;
406 skip_fields = 0;
407 check_chars = SIZE_MAX;
408 mode = output_all;
409 countmode = count_none;
410 delimit_groups = DM_NONE;
412 for (;;)
414 /* Parse an operand with leading "+" as a file after "--" was
415 seen; or if pedantic and a file was seen; or if not
416 obsolete. */
418 if (optc == -1
419 || (posixly_correct && nfiles != 0)
420 || ((optc = getopt_long (argc, argv,
421 "-0123456789Dcdf:is:uw:", longopts, NULL))
422 == -1))
424 if (optind == argc)
425 break;
426 if (nfiles == 2)
428 error (0, 0, _("extra operand `%s'"), argv[optind]);
429 usage (1);
431 file[nfiles++] = argv[optind++];
433 else switch (optc)
435 case 1:
437 unsigned long int size;
438 if (optarg[0] == '+'
439 && posix2_version () < 200112
440 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
441 && size <= SIZE_MAX)
442 skip_chars = size;
443 else if (nfiles == 2)
445 error (0, 0, _("extra operand `%s'"), optarg);
446 usage (1);
448 else
449 file[nfiles++] = optarg;
451 break;
453 case '0':
454 case '1':
455 case '2':
456 case '3':
457 case '4':
458 case '5':
459 case '6':
460 case '7':
461 case '8':
462 case '9':
464 size_t s = skip_fields;
465 skip_fields = s * 10 + optc - '0';
466 if (SIZE_MAX / 10 < s || skip_fields < s)
467 error (EXIT_FAILURE, 0, "%s",
468 _("invalid number of fields to skip"));
469 obsolete_skip_fields = true;
471 break;
473 case 'c':
474 countmode = count_occurrences;
475 break;
477 case 'd':
478 mode = output_repeated;
479 break;
481 case 'D':
482 mode = output_all_repeated;
483 if (optarg == NULL)
484 delimit_groups = DM_NONE;
485 else
486 delimit_groups = XARGMATCH ("--all-repeated", optarg,
487 delimit_method_string,
488 delimit_method_map);
489 break;
491 case 'f': /* Like '-#'. */
492 skip_fields = size_opt (optarg,
493 N_("invalid number of fields to skip"));
494 break;
496 case 'i':
497 ignore_case = 1;
498 break;
500 case 's': /* Like '+#'. */
501 skip_chars = size_opt (optarg,
502 N_("invalid number of bytes to skip"));
503 break;
505 case 'u':
506 mode = output_unique;
507 break;
509 case 'w':
510 check_chars = size_opt (optarg,
511 N_("invalid number of bytes to compare"));
512 break;
514 case_GETOPT_HELP_CHAR;
516 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
518 default:
519 usage (1);
523 if (obsolete_skip_fields && 200112 <= posix2_version ())
525 error (0, 0, _("`-%lu' option is obsolete; use `-f %lu'"),
526 (unsigned long) skip_fields, (unsigned long) skip_fields);
527 usage (EXIT_FAILURE);
530 if (countmode == count_occurrences && mode == output_all_repeated)
532 error (0, 0,
533 _("printing all duplicated lines and repeat counts is meaningless"));
534 usage (1);
537 check_file (file[0], file[1]);
539 exit (EXIT_SUCCESS);