bugfix
[hband-tools.git] / tabdata / td-filter
blob0a3ba867e178666e50192bde3890305db1041c5e
1 #!/usr/bin/env perl
3 =pod
5 =head1 NAME
7 td-filter - Show only those records from the input tabular data stream which match to the conditions.
9 =head1 USAGE
11 td-filter [I<OPTIONS>] [--] I<COLUMN> I<OPERATOR> I<R-VALUE> [[or] I<COLUMN> I<OPERATOR> I<R-VALUE> [[or] ...]]
13 td-filter [I<OPTIONS>] --perl I<EXPR>
15 =head1 DESCRIPTION
17 Pass through those records which match at least one of the conditions (inclusive OR).
18 A condition consists of a triplet of I<COLUMN>, I<OPERATOR>, and I<R-VALUE>.
19 You may put together conditions conjunctively (AND) by chaining multiple td-filter(1) commands by shell pipes.
20 Example:
22 td-filter NAME eq john NAME eq jacob | tr-filter AGE -gt 18
24 This gives the records with either john or jacob, and all of them will be above 18.
26 The optional word "B<or>" between triplets makes your code more explicite.
28 td-filter(1) evaluates the Perl expression in the second form and passes through records
29 only if the result is true-ish in Perl (non zero, non empty string, etc).
30 Each field's value is in @F by index, and in %F by column name.
31 You can implement more complex conditions in this way.
33 =head1 OPTIONS
35 =over 4
37 =item -H, --no-header
39 do not show headers
41 =item -h, --header
43 show headers (default)
45 =item -i, --ignore-non-existing-columns
47 do not treat non-existing (missing or typo) column names as failure
49 =item -w, --warn-non-existing-columns
51 only show warning on non-existing (missing or typo) column names, but don't fail
53 =item -N, --no-fail-non-numeric
55 do not fail when a non-numeric r-value is given to a numeric operator
57 =item -W, --no-warn-non-numeric
59 do not show warning when a non-numeric r-value is given to a numeric operator
61 =back
63 =head1 OPERATORS
65 These operators are supported, semantics are the same as in Perl, see perlop(1).
67 == != <= >= < > =~ !~ eq ne gt lt
69 For your convenience, not to bother with escaping, you may also use these operators as alternatives to the canonical ones above:
71 =over 4
73 =item is
75 string equality (B<eq>)
77 =item is not
79 =item isnt
81 =item isnot
83 string inequality (B<ne>)
85 =item -eq
87 numerical equality (B<==>)
89 =item -ne
91 numerical inequality (B<!=>)
93 =item <>
95 numerical inequality (B<!=>)
97 =item -gt
99 numerical greater than (B<E<gt>>)
101 =item -ge
103 numerical greater or equal (B<E<gt>=>)
105 =item -lt
107 numerical less than (B<E<lt>>)
109 =item -le
111 numerical less or equal (B<E<lt>=>)
113 =item match
115 regexp match (B<=~>)
117 =item notmatch
119 =item does not match
121 negated regexp match (B<!~>)
123 =back
125 Other operators:
127 =over 4
129 =item [is] one of
131 I<R-VALUE> is split into pieces by commas (C<,>) and
132 equality to at least one of them is required
134 =item [contains | has] [whole word]
136 Substring match.
137 Plural forms ("contain" and "have") are also accepted.
138 Optional I<whole word> is a literal part of the operator.
140 =item [contains | has] one of [whole word]
142 Similar to B<is one of>, but substring match is checked
143 instead of full string equality.
144 Optional I<whole word> is a literal part of the operator.
146 =back
148 Operators may be preceeded by I<not>, I<does not>, I<do not> to negate their effect.
150 =head1 CAVEATS
152 If there is no I<COLUMN> column in the input data, it's silently considered empty.
153 td-filter(1) does not need I<R-VALUE> to be quoted or escaped, however your shell may do.
155 =head1 REFERENCES
157 td-filter(1) is analogous to SQL WHERE.
159 =cut
162 $OptShowHeader = 1;
163 $OptWarnBadColumnNames = 1;
164 $OptFailBadColumnNames = 1;
165 $OptWarnNonNumericRValue = 1;
166 $OptFailNonNumericRValue = 1;
167 %OptionDefs = (
168 'H|no-header' => sub { $OptShowHeader = 0; },
169 'h|header' => sub { $OptShowHeader = 1; },
170 'i|ignore-non-existing-columns' => sub { $OptFailBadColumnNames = 0; $OptWarnBadColumnNames = 0; },
171 'w|warn-non-existing-columns' => sub { $OptFailBadColumnNames = 0; $OptWarnBadColumnNames = 1; },
172 'N|no-fail-non-numeric' => sub { $OptFailNonNumericRValue = 0; },
173 'W|no-warn-non-numeric' => sub { $OptWarnNonNumericRValue = 0; },
176 use Switch;
177 use List::MoreUtils qw/all any none/;
178 use Data::Dumper;
179 no if ($] >= 5.018), 'warnings' => 'experimental::smartmatch';
180 do '/usr/lib/tool/perl5/tabdata/common.pl' or die "$@";
182 @Filter = ();
183 $Expr = undef;
185 %operator_alias = (
186 '-eq', '==',
187 'is', 'eq',
188 '<>', '!=',
189 '-ne', '!=',
190 'isnt', 'ne',
191 'isnot', 'ne',
192 'is not', 'ne',
193 '-gt', '>',
194 '-lt', '<',
195 '-ge', '>=',
196 '-le', '<=',
197 'notmatch', '!~',
198 'match', '=~',
200 %operator_alterform = (
201 'have', 'contains',
202 'has', 'contains',
203 'contain', 'contains',
206 if($ARGV[0] eq '--perl')
208 shift @ARGV;
209 die "$0: missing Perl expression\n" unless @ARGV;
210 $Expr = join ';', @ARGV;
212 else
214 while(scalar @ARGV)
216 shift @ARGV if $ARGV[0] eq 'or';
218 my $column = shift @ARGV;
220 my $negation = 0;
221 if($ARGV[0] eq 'not') { $negation = 1; shift @ARGV; }
222 elsif($ARGV[0] ~~ ['does', 'do'] and $ARGV[1] eq 'not') { $negation = 1; shift @ARGV; shift @ARGV; }
224 my $operator = shift @ARGV or die "$0: missing operator for: $column\n";
225 if($operator eq 'is' and $ARGV[0] eq 'not') { $operator .= ' ' . shift @ARGV; }
226 $operator = $operator_alterform{$operator} if exists $operator_alterform{$operator};
227 if($operator ~~ ['contains', 'is'] and $ARGV[0] ~~ ['one', 'any'] and $ARGV[1] eq 'of')
228 { $operator .= '-one-of'; shift @ARGV; shift @ARGV; }
230 $operator = $operator_alias{$operator} if exists $operator_alias{$operator};
232 my $wholeword = 0;
233 if($operator ~~ [qw/contains contains-one-of/] and $ARGV[0] eq 'whole' and $ARGV[1] eq 'word')
234 { $wholeword = 1; shift @ARGV; shift @ARGV; }
236 my $rvalue = shift @ARGV;
237 die "$0: missing r-value for: $column $operator\n" if not defined $rvalue;
240 unless($operator ~~ [qw/== != <= >= < > =~ !~ eq ne gt lt is-one-of contains contains-one-of/])
242 die "$0: unknown operator: $operator\n";
245 if($operator ~~ [qw/== != <= >= < >/] and $rvalue !~ /^[+-]?[0-9]\d*(.\d+|)(E[+-]?\d+|)$/i)
247 warn "$0: operator ($operator) is numeric but r-value ($rvalue) is not\n" if $OptWarnNonNumericRValue;
248 exit -1 if $OptFailNonNumericRValue;
251 push @Filter, {'field'=>$column, 'operator'=>$operator, 'negation'=>$negation, 'value'=>$rvalue, 'wholeword'=>$wholeword};
255 process_header(scalar <STDIN>);
257 if(not defined $Expr)
259 for my $filter (@Filter)
261 my $colname = $filter->{'field'};
262 if(not exists $Header{$colname})
264 my $cols = join ", ", @Header;
265 warn "$0: $colname: no such column. known columns: $cols\n" if $OptWarnBadColumnNames;
266 exit 3 if $OptFailBadColumnNames;
271 if($OptShowHeader and length $HeaderLine)
273 print $HeaderLine.$RS;
276 while(not eof STDIN)
278 my @Field = read_record(\*STDIN);
280 if(defined $Expr)
282 my @F = @Field;
283 my %F = map {$_=>$Field[$Header{$_}]} keys %Header;
284 $pass = eval $Expr;
285 warn $@ if $@;
287 else
289 $pass = 0;
291 for my $filter (@Filter)
293 my $left_data = exists $Header{$filter->{'field'}} ? $Field[$Header{$filter->{'field'}}] : '';
294 my $right_data = $filter->{'value'};
295 my $operator = $filter->{'operator'};
296 my $result;
298 if($filter->{'wholeword'})
300 # have to match "whole word" - simple substring match won't be enough.
301 # transform "contains" and "contains-one-of" operators into a regexp match.
303 if($operator eq 'contains-one-of')
305 $right_data = '\b(' . (join '|', map { quotemeta($_) } split /,/, $right_data) . ')\b';
307 elsif($operator eq 'contains')
309 $right_data = '\b' . quotemeta($right_data) . '\b';
311 else { die "$0: should not happen\n"; }
312 $operator = '=~';
315 switch($operator)
317 case 'contains'
319 $result = 1 + index $left_data, $right_data;
321 case 'is-one-of'
323 $result = any { $left_data eq $_ } split /,/, $right_data;
325 case 'contains-one-of'
327 $result = any { 1 + index $left_data, $_ } split /,/, $right_data;
329 else
331 my $left_code = '$left_data';
332 my $right_code = '$right_data';
333 $right_code = "/$right_code/" if $operator ~~ ['=~', '!~'];
334 $result = eval "$left_code $operator $right_code";
338 $result = !$result if $filter->{'negation'};
340 if($result)
342 $pass = 1;
343 last;
348 if($pass)
350 print join($FS, @Field).$RS;