1 /***********************************************************************
3 * This software is part of the ast package *
4 * Copyright (c) 1992-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
13 * Information and Software Systems Research *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
20 ***********************************************************************/
25 * Written by David Korn
/*
 * Option/usage description string for the uniq builtin.
 * b_uniq() hands this string to optget() when parsing the command line.
 */
28 static const char usage
[] =
29 "[-n?\n@(#)$Id: uniq (AT&T Research) 2009-11-28 $\n]"
31 "[+NAME?uniq - Report or filter out repeated lines in a file]"
32 "[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and "
33 "writes one copy of each input line on the output. The second "
34 "and succeeding copies of the repeated adjacent lines are not "
36 "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
37 "to standard output. If no \ainfile\a is given, or if the \ainfile\a "
38 "is \b-\b, \buniq\b reads from standard input with the start of "
39 "the file defined as the current offset.]"
40 "[c:count?Output the number of times each line occurred along with "
42 "[d:repeated|duplicates?Output the first of each duplicate line.]"
43 "[D:all-repeated?Output all duplicate lines as a group with an empty "
44 "line delimiter specified by \adelimit\a:]:?[delimit:=none]"
46 "[n:none?Do not delimit duplicate groups.]"
47 "[p:prepend?Prepend an empty line before each group.]"
48 "[s:separate?Separate each group with an empty line.]"
50 "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
51 "before checking for uniqueness. A field is the minimal string matching "
52 "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to "
53 "\b--skip-fields\b=\anumber\a.]"
54 "[i:ignore-case?Ignore case in comparisons.]"
55 "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
56 "before checking for uniqueness. If specified along with \b-f\b, "
57 "the first \achars\a after the first \afields\a are ignored. If "
58 "the \achars\a specifies more characters than are on the line, "
59 "an empty string will be used for comparison. +\anumber\a is "
60 "equivalent to \b--skip-chars\b=\anumber\a.]"
61 "[u:unique?Output unique lines.]"
62 "[w:check-chars]#[chars?\achars\a is the number of characters to compare "
63 "after skipping any specified fields and characters.]"
65 "\n[infile [outfile]]\n"
68 "[+0?The input file was successfully processed.]"
69 "[+>0?An error occurred.]"
71 "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
/*
 * Record comparison callback: takes two byte strings and a length.
 * b_uniq() casts memcmp and strncasecmp to this type, and uniq()
 * treats a zero return value as "the two records are equal".
 */
83 typedef int (*Compare_f
)(const char*, const char*, size_t);
/*
 * Core uniq filter: reads newline-delimited records from fdin, compares
 * each one against the previously saved record, and writes to fdout.
 *
 * fdin     input stream, read with sfgetr()
 * fdout    output stream, written with sfwrite()/sfprintf()/sfreserve()
 * fields   number of leading blank-separated fields to skip before comparing
 * chars    number of characters to skip after the skipped fields
 * width    when >= 0, upper bound on how much of the record is compared
 * mode     bit mask; the visible code tests it against D_FLAG and U_FLAG
 * all      optional pointer; when non-null and *all > 0 one extra byte
 *          (sep) is reserved in front of the saved record
 * compare  comparison callback, invoked as (*compare)(cp, orecp, reclen);
 *          a zero result means the records match
 *
 * NOTE(review): several lines of this function are missing from this
 * listing (see the gaps in the embedded line numbers), so the return
 * value and parts of the control flow cannot be confirmed from here.
 */
85 static int uniq(Sfio_t
*fdin
, Sfio_t
*fdout
, int fields
, int chars
, int width
, int mode
, int* all
, Compare_f compare
)
87 register int n
, f
, outsize
=0, mb
= mbwide();
88 register char *cp
, *ep
, *mp
, *bufp
, *outp
;
89 char *orecp
, *sbufp
=0, *outbuff
;
90 int reclen
,oreclen
= -1,count
=0,cwidth
=0,sep
,next
;
/* fetch the next newline-terminated record from fdin */
95 if(bufp
= sfgetr(fdin
,'\n',0))
/* otherwise retry with SF_LASTR (presumably a final record that has no trailing newline -- confirm against the sfio docs) */
97 else if(bufp
= sfgetr(fdin
,'\n',SF_LASTR
))
/* copy the n record bytes into an fmtbuf() scratch buffer sized n + 1 */
100 bufp
= memcpy(fmtbuf(n
+ 1), bufp
, n
);
110 while (f
-->0 && cp
<ep
) /* skip over fields */
/*
 * The blank test must be parenthesized: without the parentheses the
 * condition parsed as (cp<ep && *cp==' ') || *cp=='\t', so *cp was
 * still read after cp reached ep and a tab found there pushed cp
 * past the end of the record.
 */
112 while (cp
<ep
&& (*cp
==' ' || *cp
=='\t'))
114 while (cp
<ep
&& *cp
!=' ' && *cp
!='\t')
/* then step over up to chars characters (loop body not shown in this listing) */
120 for (f
= chars
; f
; f
--)
/* reclen = bytes of the record remaining after the skipped prefix */
125 if ((reclen
= n
- (cp
- bufp
)) <= 0)
130 else if (width
>= 0 && width
< reclen
)
136 while (reclen
< width
&& mp
< ep
)
/* records match when the lengths are equal and compare() reports no difference */
149 if(reclen
==oreclen
&& (!reclen
|| !(*compare
)(cp
,orecp
,reclen
)))
161 if(((mode
&D_FLAG
)&&count
==0) || ((mode
&U_FLAG
)&&count
))
164 sfwrite(fdout
,outp
,0);
175 outp
[f
++] = '0' + count
+ 1;
178 else if(count
<MAXCNT
)
185 outp
[f
--] = '0' + (count
% 10);
186 } while (count
/= 10);
192 outsize
-= (CWIDTH
+1);
195 if(!(sbufp
=fmtbuf(outsize
)))
197 memcpy(sbufp
,outp
+CWIDTH
+1,outsize
);
198 sfwrite(fdout
,outp
,0);
203 sfprintf(fdout
,"%4d ",count
+1);
206 if(sfwrite(fdout
,outp
,outsize
) != outsize
)
215 if(sfwrite(fdout
,outp
,outsize
) != outsize
)
/* sep is 1 when all is non-null and *all > 0, otherwise 0 */
222 sep
= all
&& *all
> 0;
223 /* save current record */
224 if (!(outbuff
= sfreserve(fdout
, 0, 0)) || (outsize
= sfvalue(fdout
)) < 0)
227 if(outsize
< n
+cwidth
+sep
)
229 /* no room in outp, clear lock and use side buffer */
230 sfwrite(fdout
,outp
,0);
231 if(!(sbufp
= outp
=fmtbuf(outsize
=n
+cwidth
+sep
)))
235 outsize
= n
+cwidth
+sep
;
236 memcpy(outp
+cwidth
+sep
,bufp
,n
);
/* orecp marks the comparison start inside the saved copy, at the same offset cp has within bufp */
240 orecp
= outp
+cwidth
+sep
+ (cp
-bufp
);
/*
 * Entry point of the uniq builtin: initializes the command with cmdinit(),
 * parses options with optget(), opens the optional input and output files,
 * runs uniq(), and returns error_info.errors.
 *
 * argc, argv  command-line arguments, forwarded to cmdinit()/optget()
 * context     opaque value forwarded to cmdinit()
 *
 * NOTE(review): the case labels and several branches of the option switch
 * are missing from this listing; comments below that depend on them are
 * marked as assumptions.
 */
246 b_uniq(int argc
, char** argv
, void* context
)
248 register int n
, mode
=0;
/* defaults: skip no fields or characters; width -1 (uniq() only applies width when it is >= 0) */
250 int fields
=0, chars
=0, width
=-1;
251 Sfio_t
*fpin
, *fpout
;
/* default comparison is memcmp */
254 Compare_f compare
= (Compare_f
)memcmp
;
256 cmdinit(argc
, argv
, context
, ERROR_CATALOG
, 0);
/* option loop: optget() yields the next option code until it returns 0 */
257 while (n
= optget(argv
, usage
)) switch (n
)
267 switch ((int)opt_info
.num
)
/* switch to strncasecmp (presumably the -i/--ignore-case case; label not visible here) */
282 compare
= (Compare_f
)strncasecmp
;
/* option text starting with '-' sets fields; the chars assignment that follows is presumably the '+number' form (else branch not visible) */
288 if(*opt_info
.option
=='-')
289 fields
= opt_info
.num
;
291 chars
= opt_info
.num
;
294 chars
= opt_info
.num
;
297 width
= opt_info
.num
;
300 error(2, "%s", opt_info
.arg
);
303 error(ERROR_usage(2), "%s", opt_info
.arg
);
/* advance argv past the parsed options to the operands */
306 argv
+= opt_info
.index
;
307 if(all
&& (mode
&C_FLAG
))
308 error(2, "-c and -D are mutually exclusive");
309 if(error_info
.errors
)
310 error(ERROR_usage(2), "%s", optusage(NiL
));
/* first operand: an input file name unless it is absent or "-" */
311 if((cp
= *argv
) && (argv
++,!streq(cp
,"-")))
313 if(!(fpin
= sfopen(NiL
,cp
,"r")))
314 error(ERROR_system(1),"%s: cannot open",cp
);
/* output file opened for writing (presumably the second operand; the assignment to cp is not visible here) */
321 if(!(fpout
= sfopen(NiL
,cp
,"w")))
322 error(ERROR_system(1),"%s: cannot create",cp
);
328 error(2, "too many arguments");
329 error(ERROR_usage(2), "%s", optusage(NiL
));
/* run the filter; its result is stored in error_info.errors and returned */
331 error_info
.errors
= uniq(fpin
,fpout
,fields
,chars
,width
,mode
,all
,compare
);
336 return(error_info
.errors
);