(do_link): Produce the same sort of one-line output for
[coreutils.git] / src / uniq.c
blob3639d93155fe75377dbe2374515e0376f90082a9
1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 86, 91, 1995-1998, 1999 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by Richard Stallman and David MacKenzie. */
20 #include <config.h>
22 #include <stdio.h>
23 #include <getopt.h>
24 #include <sys/types.h>
26 #include "system.h"
27 #include "linebuffer.h"
28 #include "error.h"
29 #include "xstrtol.h"
30 #include "memcasecmp.h"
32 /* The official name of this program (e.g., no `g' prefix). */
33 #define PROGRAM_NAME "uniq"
35 #define AUTHORS "Richard Stallman and David MacKenzie"
/* Exchange the two `struct linebuffer *' lvalues A and B.
   Both arguments are expanded twice, so they must be free of side
   effects.  */
#define SWAP_LINES(A, B)                        \
  do                                            \
    {                                           \
      struct linebuffer *swap_hold = (A);       \
      (A) = (B);                                \
      (B) = swap_hold;                          \
    }                                           \
  while (0)
/* Name under which this program was invoked.  */
char *program_name;

/* How many leading fields of each line are left out of comparisons.  */
static int skip_fields;

/* How many further characters are left out once the fields have been
   skipped.  */
static int skip_chars;

/* Upper bound on the number of characters compared; 0 means the whole
   (remaining) line is compared.  */
static int check_chars;

/* Whether output lines are preceded by a count of their occurrences
   in the input.  */
enum countmode
{
  count_occurrences,    /* -c  Prefix each output line with its count.  */
  count_none            /* Default.  No counts.  */
};

/* Counting behaviour currently in effect.  */
static enum countmode countmode;

/* Which input lines make it to the output.  */
enum output_mode
{
  output_repeated,      /* -d  Only lines that are repeated.  */
  output_all_repeated,  /* -D  Every copy of lines that are repeated.  */
  output_unique,        /* -u  Only lines that are not repeated.  */
  output_all            /* Default.  First copy of each line.  */
};

/* Line selection currently in effect.  */
static enum output_mode mode;

/* Nonzero means letter case is ignored when comparing.  */
static int ignore_case;
83 static struct option const longopts[] =
85 {"count", no_argument, NULL, 'c'},
86 {"repeated", no_argument, NULL, 'd'},
87 {"all-repeated", no_argument, NULL, 'D'},
88 {"ignore-case", no_argument, NULL, 'i'},
89 {"unique", no_argument, NULL, 'u'},
90 {"skip-fields", required_argument, NULL, 'f'},
91 {"skip-chars", required_argument, NULL, 's'},
92 {"check-chars", required_argument, NULL, 'w'},
93 {GETOPT_HELP_OPTION_DECL},
94 {GETOPT_VERSION_OPTION_DECL},
95 {NULL, 0, NULL, 0}
/* Print usage information and terminate the program.
   When STATUS is nonzero, only a one-line "Try --help" hint naming
   program_name is written to stderr.  When STATUS is zero, the synopsis,
   the option summary and the bug-report address are written to stdout.
   The process then exits with EXIT_SUCCESS if STATUS is 0 and with
   EXIT_FAILURE otherwise; this function does not return.  */
98 void
99 usage (int status)
101 if (status != 0)
102 fprintf (stderr, _("Try `%s --help' for more information.\n"),
103 program_name);
104 else
106 printf (_("\
107 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
109 program_name);
110 printf (_("\
111 Discard all but one of successive identical lines from INPUT (or\n\
112 standard input), writing to OUTPUT (or standard output).\n\
114 -c, --count prefix lines by the number of occurrences\n\
115 -d, --repeated only print duplicate lines\n\
116 -D, --all-repeated print all duplicate lines\n\
117 -f, --skip-fields=N avoid comparing the first N fields\n\
118 -i, --ignore-case ignore differences in case when comparing\n\
119 -s, --skip-chars=N avoid comparing the first N characters\n\
120 -u, --unique only print unique lines\n\
121 -w, --check-chars=N compare no more than N characters in lines\n\
122 -N same as -f N\n\
123 +N same as -s N\n\
124 --help display this help and exit\n\
125 --version output version information and exit\n\
127 A field is a run of whitespace, then non-whitespace characters.\n\
128 Fields are skipped before chars.\n\
129 "));
130 puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
/* Map the caller's status onto the two standard exit codes.  */
132 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
135 /* Given a linebuffer LINE,
136 return a pointer to the beginning of the line's field to be compared. */
138 static char *
139 find_field (const struct linebuffer *line)
141 register int count;
142 register char *lp = line->buffer;
143 register size_t size = line->length;
144 register size_t i = 0;
146 for (count = 0; count < skip_fields && i < size; count++)
148 while (i < size && ISBLANK (lp[i]))
149 i++;
150 while (i < size && !ISBLANK (lp[i]))
151 i++;
154 for (count = 0; count < skip_chars && i < size; count++)
155 i++;
157 return lp + i;
160 /* Return zero if two strings OLD and NEW match, nonzero if not.
161 OLD and NEW point not to the beginnings of the lines
162 but rather to the beginnings of the fields to compare.
163 OLDLEN and NEWLEN are their lengths. */
165 static int
166 different (const char *old, const char *new, size_t oldlen, size_t newlen)
168 register int order;
170 if (check_chars)
172 if (oldlen > check_chars)
173 oldlen = check_chars;
174 if (newlen > check_chars)
175 newlen = check_chars;
178 /* Use an if-statement here rather than a function variable to
179 avoid portability hassles of getting a non-conflicting declaration
180 of memcmp. */
181 if (ignore_case)
182 order = memcasecmp (old, new, MIN (oldlen, newlen));
183 else
184 order = memcmp (old, new, MIN (oldlen, newlen));
186 if (order == 0)
187 return oldlen - newlen;
188 return order;
191 /* Output the line in linebuffer LINE to stream STREAM
192 provided that the switches say it should be output.
193 If requested, print the number of times it occurred, as well;
194 LINECOUNT + 1 is the number of times that the line occurred. */
196 static void
197 writeline (const struct linebuffer *line, FILE *stream, int linecount)
199 if ((mode == output_unique && linecount != 0)
200 || (mode == output_repeated && linecount == 0)
201 || (mode == output_all_repeated && linecount == 0))
202 return;
204 if (countmode == count_occurrences)
205 fprintf (stream, "%7d\t", linecount + 1);
207 fwrite (line->buffer, sizeof (char), line->length, stream);
210 /* Process input file INFILE with output to OUTFILE.
211 If either is "-", use the standard I/O stream for it instead. */
213 static void
214 check_file (const char *infile, const char *outfile)
216 FILE *istream;
217 FILE *ostream;
218 struct linebuffer lb1, lb2;
219 struct linebuffer *thisline, *prevline;
221 if (STREQ (infile, "-"))
222 istream = stdin;
223 else
224 istream = fopen (infile, "r");
225 if (istream == NULL)
226 error (EXIT_FAILURE, errno, "%s", infile);
228 if (STREQ (outfile, "-"))
229 ostream = stdout;
230 else
231 ostream = fopen (outfile, "w");
232 if (ostream == NULL)
233 error (EXIT_FAILURE, errno, "%s", outfile);
235 thisline = &lb1;
236 prevline = &lb2;
238 initbuffer (thisline);
239 initbuffer (prevline);
241 /* The duplication in the following `if' and `else' blocks is an
242 optimization to distinguish the common case (in which none of
243 the following options has been specified: --count, -repeated,
244 --all-repeated, --unique) from the others. In the common case,
245 this optimization lets uniq output each different line right away,
246 without waiting to see if the next one is different. */
248 if (mode == output_all && countmode == count_none)
250 char *prevfield IF_LINT (= NULL);
251 size_t prevlen IF_LINT (= 0);
253 while (!feof (istream))
255 char *thisfield;
256 size_t thislen;
257 if (readline (thisline, istream) == 0)
258 break;
259 thisfield = find_field (thisline);
260 thislen = thisline->length - (thisfield - thisline->buffer);
261 if (prevline->length == 0
262 || different (thisfield, prevfield, thislen, prevlen))
264 fwrite (thisline->buffer, sizeof (char),
265 thisline->length, ostream);
267 SWAP_LINES (prevline, thisline);
268 prevfield = thisfield;
269 prevlen = thislen;
273 else
275 char *prevfield;
276 size_t prevlen;
277 int match_count = 0;
279 if (readline (prevline, istream) == 0)
280 goto closefiles;
281 prevfield = find_field (prevline);
282 prevlen = prevline->length - (prevfield - prevline->buffer);
284 while (!feof (istream))
286 int match;
287 char *thisfield;
288 size_t thislen;
289 if (readline (thisline, istream) == 0)
290 break;
291 thisfield = find_field (thisline);
292 thislen = thisline->length - (thisfield - thisline->buffer);
293 match = !different (thisfield, prevfield, thislen, prevlen);
295 if (match)
296 ++match_count;
298 if (!match || mode == output_all_repeated)
300 writeline (prevline, ostream, match_count);
301 SWAP_LINES (prevline, thisline);
302 prevfield = thisfield;
303 prevlen = thislen;
304 if (!match)
305 match_count = 0;
309 writeline (prevline, ostream, match_count);
312 closefiles:
313 if (ferror (istream) || fclose (istream) == EOF)
314 error (EXIT_FAILURE, errno, _("error reading %s"), infile);
316 if (ferror (ostream) || fclose (ostream) == EOF)
317 error (EXIT_FAILURE, errno, _("error writing %s"), outfile);
319 free (lb1.buffer);
320 free (lb2.buffer);
324 main (int argc, char **argv)
326 int optc;
327 char *infile = "-", *outfile = "-";
329 program_name = argv[0];
330 setlocale (LC_ALL, "");
331 bindtextdomain (PACKAGE, LOCALEDIR);
332 textdomain (PACKAGE);
334 skip_chars = 0;
335 skip_fields = 0;
336 check_chars = 0;
337 mode = output_all;
338 countmode = count_none;
340 while ((optc = getopt_long (argc, argv, "0123456789cdDf:is:uw:", longopts,
341 NULL)) != -1)
343 switch (optc)
345 case 0:
346 break;
348 case '0':
349 case '1':
350 case '2':
351 case '3':
352 case '4':
353 case '5':
354 case '6':
355 case '7':
356 case '8':
357 case '9':
358 skip_fields = skip_fields * 10 + optc - '0';
359 break;
361 case 'c':
362 countmode = count_occurrences;
363 break;
365 case 'd':
366 mode = output_repeated;
367 break;
369 case 'D':
370 mode = output_all_repeated;
371 break;
373 case 'f': /* Like '-#'. */
375 long int tmp_long;
376 if (xstrtol (optarg, NULL, 10, &tmp_long, "") != LONGINT_OK
377 || tmp_long <= 0 || tmp_long > INT_MAX)
378 error (EXIT_FAILURE, 0,
379 _("invalid number of fields to skip: `%s'"),
380 optarg);
381 skip_fields = (int) tmp_long;
383 break;
385 case 'i':
386 ignore_case = 1;
387 break;
389 case 's': /* Like '+#'. */
391 long int tmp_long;
392 if (xstrtol (optarg, NULL, 10, &tmp_long, "") != LONGINT_OK
393 || tmp_long <= 0 || tmp_long > INT_MAX)
394 error (EXIT_FAILURE, 0,
395 _("invalid number of bytes to skip: `%s'"),
396 optarg);
397 skip_chars = (int) tmp_long;
399 break;
401 case 'u':
402 mode = output_unique;
403 break;
405 case 'w':
407 long int tmp_long;
408 if (xstrtol (optarg, NULL, 10, &tmp_long, "") != LONGINT_OK
409 || tmp_long <= 0 || tmp_long > INT_MAX)
410 error (EXIT_FAILURE, 0,
411 _("invalid number of bytes to compare: `%s'"),
412 optarg);
413 check_chars = (int) tmp_long;
415 break;
417 case_GETOPT_HELP_CHAR;
419 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
421 default:
422 usage (1);
426 if (optind >= 2 && !STREQ (argv[optind - 1], "--"))
428 /* Interpret non-option arguments with leading `+' only
429 if we haven't seen `--'. */
430 while (optind < argc && argv[optind][0] == '+')
432 char *opt_str = argv[optind++];
433 long int tmp_long;
434 if (xstrtol (opt_str, NULL, 10, &tmp_long, "") != LONGINT_OK
435 || tmp_long <= 0 || tmp_long > INT_MAX)
436 error (EXIT_FAILURE, 0,
437 _("invalid number of bytes to compare: `%s'"),
438 opt_str);
439 skip_chars = (int) tmp_long;
443 if (optind < argc)
444 infile = argv[optind++];
446 if (optind < argc)
447 outfile = argv[optind++];
449 if (optind < argc)
451 error (0, 0, _("too many arguments"));
452 usage (1);
455 if (countmode == count_occurrences && mode == output_all_repeated)
457 error (0, 0,
458 _("printing all duplicated lines and repeat counts is meaningless"));
459 usage (1);
462 check_file (infile, outfile);
464 exit (EXIT_SUCCESS);