1 /***********************************************************************
3 * This software is part of the ast package *
4 * Copyright (c) 1992-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
13 * Information and Software Systems Research *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
20 ***********************************************************************/
24 * AT&T Bell Laboratories
26 * cut selected fields or columns from each line of a file
29 static const char usage
[] =
30 "[-?\n@(#)$Id: cut (AT&T Research) 2009-12-04 $\n]"
32 "[+NAME?cut - cut out selected columns or fields of each line of a file]"
33 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
34 "from one or more files, contatenating them on standard output.]"
35 "[+?The option argument \alist\a is a comma-separated or blank-separated "
36 "list of positive numbers and ranges. Ranges can be of three "
37 "forms. The first is two positive integers separated by a hyphen "
38 "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
39 "\ahigh\a. The second is a positive number preceded by a hyphen "
40 "(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
41 "\ahigh\a. The last is a positive number followed by a hyphen "
42 "(\alow\a\b-\b), which represents all fields from \alow\a to the "
43 "last field, inclusive. Elements in the \alist\a can be repeated, "
44 "can overlap, and can appear in any order. The order of the "
45 "output is that of the input.]"
46 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
47 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
48 "cuts from standard input. The start of the file is defined "
49 "as the current offset.]"
50 "[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
51 "[c:characters]:[list?\bcut\b based on a list of character counts.]"
52 "[d:delimiter]:[delim?The field character for the \b-f\b option is set "
53 "to \adelim\a. The default is the \btab\b character.]"
54 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
55 "character specified with the \b-d\b optiion.]"
56 "[n!:split?Split multibyte characters selected by the \b-b\b option.]"
57 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
58 "records of length \areclen\a when used with the \b-b\b or \b-c\b "
60 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
61 "when used with the \b-f\b option. By default, lines with no "
62 "delimiters will be passsed in untouched.]"
63 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
64 "the \b-f\b option is set to \aldelim\a. The default is the "
65 "\bnewline\b character.]"
66 "[N!:newline?Output new-lines at end of each record when used "
67 "with the \b-b\b or \b-c\b option.]"
72 "[+0?All files processed successfully.]"
73 "[+>0?One or more files failed to open or could not be read.]"
75 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
81 typedef struct Delim_s
99 unsigned char space
[UCHAR_MAX
+1];
100 int list
[2]; /* NOTE: must be last member */
110 #define C_NONEWLINE 32
116 #define mb2wc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n)
119 * compare the first of an array of integers
123 mycomp(register const void* a
, register const void* b
)
125 if (*((int*)a
) < *((int*)b
))
127 if (*((int*)a
) > *((int*)b
))
133 cutinit(int mode
, char* str
, Delim_t
* wdelim
, Delim_t
* ldelim
, size_t reclen
)
138 register int range
= 0;
139 register char* cp
= str
;
142 if (!(cut
= (Cut_t
*)stakalloc(sizeof(Cut_t
) + strlen(cp
) * sizeof(int))))
143 error(ERROR_exit(1), "out of space");
144 if (cut
->mb
= mbwide())
146 memset(cut
->space
, 0, sizeof(cut
->space
) / 2);
147 memset(cut
->space
+ sizeof(cut
->space
) / 2, SP_WIDE
, sizeof(cut
->space
) / 2);
150 memset(cut
->space
, 0, sizeof(cut
->space
));
151 cut
->wdelim
= *wdelim
;
152 if (wdelim
->len
== 1)
153 cut
->space
[wdelim
->chr
] = SP_WORD
;
154 cut
->ldelim
= *ldelim
;
155 cut
->eob
= (ldelim
->len
== 1) ? ldelim
->chr
: 0;
156 cut
->space
[cut
->eob
] = SP_LINE
;
157 cut
->cflag
= (mode
&C_CHARS
) && cut
->mb
;
158 cut
->nosplit
= (mode
&(C_BYTES
|C_NOSPLIT
)) == (C_BYTES
|C_NOSPLIT
) && cut
->mb
;
159 cut
->sflag
= (mode
&C_SUPRESS
) != 0;
160 cut
->nlflag
= (mode
&C_NONEWLINE
) != 0;
161 cut
->reclen
= reclen
;
168 while(*cp
==' ' || *cp
=='\t')
176 if((n
= (n
? (n
-range
) : (HUGE
-1))) < 0)
177 error(ERROR_exit(1),"invalid range for c/f option");
190 n
= 1 + (lp
-cut
->list
)/2;
191 qsort(lp
=cut
->list
,n
,2*sizeof(*lp
),mycomp
);
192 /* eliminate overlapping regions */
193 for(n
=0,range
= -2,dp
=lp
; *lp
!=HUGE
; lp
+=2)
202 if((c
= lp
[0]+lp
[1]-range
)>0)
210 range
= *dp
++ = lp
[0];
216 range
+= (*dp
++ = lp
[1]);
221 /* convert ranges into gaps */
222 for(n
=0; *lp
!=HUGE
; lp
+=2)
235 error(ERROR_exit(1),"bad list for c/f option");
242 error(ERROR_exit(1),"bad list for c/f option");
250 * cut each line of file <fdin> and put results to <fdout> using list <list>
254 cutcols(Cut_t
* cut
, Sfio_t
* fdin
, Sfio_t
* fdout
)
258 register int ncol
= 0;
259 register const int* lp
= cut
->list
;
261 register int skip
; /* non-zero for don't copy */
268 if (len
= cut
->reclen
)
269 bp
= sfreserve(fdin
, len
, -1);
271 bp
= sfgetr(fdin
, '\n', 0);
272 if (!bp
&& !(bp
= sfgetr(fdin
, 0, SF_LASTR
)))
277 if (!(ncol
= skip
= *(lp
= cut
->list
)))
284 register const char* s
= bp
;
285 register int w
= len
< ncol
? len
: ncol
;
292 else if ((z
= mblen(s
, w
)) <= 0)
297 bp
= (char*)(s
= xx
);
311 ncol
= !w
&& ncol
>= len
;
315 register const char* s
= bp
;
316 register int w
= len
;
319 while (w
> 0 && ncol
> 0)
322 if (!(*s
& 0x80) || (z
= mblen(s
, w
)) <= 0)
329 ncol
= !w
&& (ncol
|| !skip
);
333 if ((c
= ncol
) > len
)
335 else if (c
== len
&& !skip
)
341 if (sfwrite(fdout
, (char*)bp
, c
) < 0)
351 } while (ncol
!= HUGE
);
352 if (!cut
->nlflag
&& (skip
|| must
|| cut
->reclen
))
354 if (cut
->ldelim
.len
> 1)
355 sfwrite(fdout
, cut
->ldelim
.str
, cut
->ldelim
.len
);
357 sfputc(fdout
, cut
->ldelim
.chr
);
363 * cut each line of file <fdin> and put results to <fdout> using list <list>
364 * stream <fdin> must be line buffered
368 cutfields(Cut_t
* cut
, Sfio_t
* fdin
, Sfio_t
* fdout
)
370 register unsigned char *sp
= cut
->space
;
371 register unsigned char *cp
;
372 register unsigned char *wp
;
373 register int c
, nfields
;
374 register const int *lp
= cut
->list
;
375 register unsigned char *copy
;
376 register int nodelim
, empty
, inword
=0;
377 register unsigned char *ep
;
378 unsigned char *bp
, *first
;
384 /* process each buffer */
385 while ((bp
= (unsigned char*)sfreserve(fdin
, SF_UNBOUND
, -1)) && (c
= sfvalue(fdin
)) > 0)
389 if((lastchar
= cp
[c
]) != cut
->eob
)
391 /* process each line in the buffer */
399 if (nfields
= *(lp
= cut
->list
))
409 /* skip over non-delimiter characters */
413 switch (c
= sp
[*(unsigned char*)cp
++])
419 while ((c
= mb2wc(w
, cp
, ep
- cp
)) <= 0)
421 /* mb char possibly spanning buffer boundary -- fun stuff */
422 if ((ep
- cp
) < mbmax())
428 if (lastchar
!= cut
->eob
)
431 if ((c
= mb2wc(w
, cp
, ep
- cp
)) > 0)
437 if ((c
= cp
- copy
) > 0 && sfwrite(fdout
, (char*)copy
, c
) < 0)
440 for (i
= 0; i
<= (ep
- cp
); i
++)
442 if (!(bp
= (unsigned char*)sfreserve(fdin
, SF_UNBOUND
, -1)) || (c
= sfvalue(fdin
)) <= 0)
446 if ((lastchar
= cp
[c
]) != cut
->eob
)
452 if ((c
= mb2wc(w
, (char*)mb
, j
)) <= 0)
457 first
= bp
= cp
+= c
- i
;
461 if (w
== cut
->ldelim
.chr
)
462 lastchar
= cut
->ldelim
.chr
;
463 else if (w
!= cut
->wdelim
.chr
)
466 if (sfwrite(fdout
, (char*)mb
, c
) < 0)
481 if (c
== cut
->wdelim
.chr
)
486 if (c
== cut
->ldelim
.chr
)
500 while (!(c
= sp
[*cp
++]));
503 /* check for end-of-line */
508 if (lastchar
== cut
->ldelim
.chr
)
510 /* restore the saved last character (lastchar) */
511 if (lastchar
!= cut
->eob
)
524 if ((c
= wp
- copy
) > 0 && sfwrite(fdout
, (char*)copy
, c
) < 0)
529 /* set to delimiter unless the first field */
530 copy
= empty
? cp
: wp
;
542 sfseek(fdtmp
,(Sfoff_t
)0,SEEK_SET
);
543 sfmove(fdtmp
,fdout
,offset
,-1);
552 sfseek(fdtmp
,offset
=0,SEEK_SET
);
554 if (copy
&& (c
=cp
-copy
)>0 && (!nodelim
|| !cut
->sflag
) && sfwrite(fdout
,(char*)copy
,c
)< 0)
557 /* see whether to save in tmp file */
558 if(inword
&& nodelim
&& !cut
->sflag
&& (c
=cp
-first
)>0)
560 /* copy line to tmpfile in case no fields */
562 fdtmp
= sftmp(BLOCK
);
563 sfwrite(fdtmp
,(char*)first
,c
);
573 b_cut(int argc
, char** argv
, void* context
)
575 register char* cp
= 0;
585 cmdinit(argc
, argv
, context
, ERROR_CATALOG
, 0);
588 wdelim
.len
= ldelim
.len
= 1;
591 switch (n
= optget(argv
, usage
))
599 error(2, "f option already specified");
609 ldelim
.str
= opt_info
.arg
;
613 ldelim
.chr
= mbchar(s
);
614 if ((n
= s
- opt_info
.arg
) > 1)
620 ldelim
.chr
= *(unsigned char*)opt_info
.arg
;
624 wdelim
.str
= opt_info
.arg
;
628 wdelim
.chr
= mbchar(s
);
629 if ((n
= s
- opt_info
.arg
) > 1)
635 wdelim
.chr
= *(unsigned char*)opt_info
.arg
;
639 if(mode
&(C_CHARS
|C_BYTES
))
641 error(2, "c option already specified");
656 reclen
= opt_info
.num
;
662 error(2, "%s", opt_info
.arg
);
665 error(ERROR_usage(2), "%s", opt_info
.arg
);
670 argv
+= opt_info
.index
;
671 if (error_info
.errors
)
672 error(ERROR_usage(2), "%s",optusage(NiL
));
675 error(2, "b, c or f option must be specified");
676 error(ERROR_usage(2), "%s", optusage(NiL
));
679 error(3, "non-empty b, c or f option must be specified");
680 if((mode
& (C_FIELDS
|C_SUPRESS
)) == C_SUPRESS
)
681 error(3, "s option requires f option");
682 cut
= cutinit(mode
, cp
, &wdelim
, &ldelim
, reclen
);
687 if(!cp
|| streq(cp
,"-"))
689 else if(!(fp
= sfopen(NiL
,cp
,"r")))
691 error(ERROR_system(0),"%s: cannot open",cp
);
695 cutfields(cut
,fp
,sfstdout
);
697 cutcols(cut
,fp
,sfstdout
);
700 } while(cp
= *argv
++);
701 if (sfsync(sfstdout
))
702 error(ERROR_system(0), "write error");
703 return error_info
.errors
!= 0;