add unicode-aware expand command
[hband-tools.git] / tabdata / td-filter
blobcdb47f06b0418c79a362f28409d3ea95d20cbaa1
1 #!/usr/bin/env perl
3 =pod
5 =head1 NAME
7 td-filter - Show only those records from the input tabular data stream which match to the conditions.
9 =head1 USAGE
11 td-filter [I<OPTIONS>] [--] I<COLUMN> I<OPERATOR> I<R-VALUE> [[or] I<COLUMN> I<OPERATOR> I<R-VALUE> [[or] ...]]
13 td-filter [I<OPTIONS>] --perl I<EXPR>
15 =head1 DESCRIPTION
17 Pass through those records which match at least one of the conditions (inclusive OR).
18 A condition consists of a triplet of I<COLUMN>, I<OPERATOR>, and I<R-VALUE>.
19 You may put together conditions conjunctively (AND) by chaining multiple td-filter(1) commands by shell pipes.
20 Example:
22 td-filter NAME eq john NAME eq jacob | tr-filter AGE -gt 18
24 This gives the records with either john or jacob, and all of them will be above 18.
26 The optional word "B<or>" between triplets makes your code more explicite.
28 td-filter(1) evaluates the Perl expression in the second form and passes through records
29 only if the result is true-ish in Perl (non zero, non empty string, etc).
30 Each field's value is in @F by index, and in %F by column name.
31 You can implement more complex conditions in this way.
33 =head1 OPTIONS
35 =over 4
37 =item -H, --no-header
39 do not show headers
41 =item -h, --header
43 show headers (default)
45 =item -i, --ignore-non-existing-columns
47 do not treat non-existing (missing or typo) column names as failure
49 =item -w, --warn-non-existing-columns
51 only show warning on non-existing (missing or typo) column names, but don't fail
53 =item -N, --no-fail-non-numeric
55 do not fail when a non-numeric r-value is given to a numeric operator
57 =item -W, --no-warn-non-numeric
59 do not show warning when a non-numeric r-value is given to a numeric operator
61 =back
63 =head1 OPERATORS
65 These operators are supported, semantics are the same as in Perl, see perlop(1).
67 == != <= >= < > =~ !~ eq ne gt lt
69 For your convenience, not to bother with escaping, you may also use these operators as alternatives to the canonical ones above:
71 =over 4
73 =item is
75 string equality (B<eq>)
77 =item is not
79 =item isnt
81 =item isnot
83 string inequality (B<ne>)
85 =item -eq
87 numerical equality (B<==>)
89 =item -ne
91 numerical inequality (B<!=>)
93 =item <>
95 numerical inequality (B<!=>)
97 =item -gt
99 numerical greater than (B<E<gt>>)
101 =item -ge
103 numerical greater or equal (B<E<gt>=>)
105 =item -lt
107 numerical less than (B<E<lt>>)
109 =item -le
111 numerical less or equal (B<E<lt>=>)
113 =item match
115 regexp match (B<=~>)
117 =item notmatch
119 =item does not match
121 negated regexp match (B<!~>)
123 =back
125 Other operators:
127 =over 4
129 =item [is] one of
131 I<R-VALUE> is split into pieces by commas (C<,>) and
132 equality to at least one of them is required
134 =item [contains | has] [whole word]
136 Substring match.
137 Plural forms ("contain" and "have") are also accepted.
138 Optional I<whole word> is a literal part of the operator.
140 =item [contains | has] one of [whole word]
142 Similar to B<is one of>, but substring match is checked
143 instead of full string equality.
144 Optional I<whole word> is a literal part of the operator.
146 =item ends with
148 =item starts with
150 =back
152 Operators may be preceeded by I<not>, I<does not>, I<do not> to negate their effect.
154 =head1 CAVEATS
156 If there is no I<COLUMN> column in the input data, it's silently considered empty.
157 td-filter(1) does not need I<R-VALUE> to be quoted or escaped, however your shell may do.
159 =head1 REFERENCES
161 td-filter(1) is analogous to SQL WHERE.
163 =cut
166 $OptShowHeader = 1;
167 $OptWarnBadColumnNames = 1;
168 $OptFailBadColumnNames = 1;
169 $OptWarnNonNumericRValue = 1;
170 $OptFailNonNumericRValue = 1;
171 %OptionDefs = (
172 'H|no-header' => sub { $OptShowHeader = 0; },
173 'h|header' => sub { $OptShowHeader = 1; },
174 'i|ignore-non-existing-columns' => sub { $OptFailBadColumnNames = 0; $OptWarnBadColumnNames = 0; },
175 'w|warn-non-existing-columns' => sub { $OptFailBadColumnNames = 0; $OptWarnBadColumnNames = 1; },
176 'N|no-fail-non-numeric' => sub { $OptFailNonNumericRValue = 0; },
177 'W|no-warn-non-numeric' => sub { $OptWarnNonNumericRValue = 0; },
180 use Switch;
181 use List::MoreUtils qw/all any none/;
182 use Data::Dumper;
183 no if ($] >= 5.018), 'warnings' => 'experimental::smartmatch';
184 do '/usr/lib/tool/perl5/tabdata/common.pl' or die "$@";
186 @Filter = ();
187 $Expr = undef;
189 %operator_alias = (
190 '-eq', '==',
191 'is', 'eq',
192 '<>', '!=',
193 '-ne', '!=',
194 'isnt', 'ne',
195 'isnot', 'ne',
196 'is not', 'ne',
197 '-gt', '>',
198 '-lt', '<',
199 '-ge', '>=',
200 '-le', '<=',
201 'notmatch', '!~',
202 'match', '=~',
204 %operator_alterform = (
205 'have', 'contains',
206 'has', 'contains',
207 'contain', 'contains',
208 'start', 'starts',
209 'end', 'ends',
210 'startwith', 'starts-with',
211 'endwith', 'ends-with',
212 'startswith', 'starts-with',
213 'endswith', 'ends-with',
216 if($ARGV[0] eq '--perl')
218 shift @ARGV;
219 die "$0: missing Perl expression\n" unless @ARGV;
220 $Expr = join ';', @ARGV;
222 else
224 while(scalar @ARGV)
226 shift @ARGV if $ARGV[0] eq 'or';
228 my $column = shift @ARGV;
230 my $negation = 0;
231 if($ARGV[0] eq 'not') { $negation = 1; shift @ARGV; }
232 elsif($ARGV[0] ~~ ['does', 'do'] and $ARGV[1] eq 'not') { $negation = 1; shift @ARGV; shift @ARGV; }
234 my $operator = shift @ARGV or die "$0: missing operator for: $column\n";
236 # handle multi-word operators:
237 if($operator eq 'is' and $ARGV[0] eq 'not') { $operator .= ' ' . shift @ARGV; }
238 $operator = $operator_alterform{$operator} if exists $operator_alterform{$operator};
239 if($operator ~~ ['contains', 'is'] and $ARGV[0] ~~ ['one', 'any'] and $ARGV[1] eq 'of')
240 { $operator .= '-one-of'; shift @ARGV; shift @ARGV; }
241 if($operator ~~ ['starts', 'ends'] and $ARGV[0] eq 'with')
242 { $operator .= '-' . shift @ARGV; }
244 $operator = $operator_alias{$operator} if exists $operator_alias{$operator};
246 my $wholeword = 0;
247 if($operator ~~ [qw/contains contains-one-of/] and $ARGV[0] eq 'whole' and $ARGV[1] eq 'word')
248 { $wholeword = 1; shift @ARGV; shift @ARGV; }
250 my $rvalue = shift @ARGV;
251 die "$0: missing r-value for: $column $operator\n" if not defined $rvalue;
254 unless($operator ~~ [qw/== != <= >= < > =~ !~ eq ne gt lt is-one-of contains contains-one-of ends-with starts-with/])
256 die "$0: unknown operator: $operator\n";
259 if($operator ~~ [qw/== != <= >= < >/] and $rvalue !~ /^[+-]?[0-9]\d*(.\d+|)(E[+-]?\d+|)$/i)
261 warn "$0: operator ($operator) is numeric but r-value ($rvalue) is not\n" if $OptWarnNonNumericRValue;
262 exit -1 if $OptFailNonNumericRValue;
265 push @Filter, {'field'=>$column, 'operator'=>$operator, 'negation'=>$negation, 'value'=>$rvalue, 'wholeword'=>$wholeword};
269 process_header(scalar <STDIN>);
271 if(not defined $Expr)
273 for my $filter (@Filter)
275 my $colname = $filter->{'field'};
276 if(not exists $Header{$colname})
278 my $cols = join ", ", @Header;
279 warn "$0: $colname: no such column. known columns: $cols\n" if $OptWarnBadColumnNames;
280 exit 3 if $OptFailBadColumnNames;
285 if($OptShowHeader and length $HeaderLine)
287 print $HeaderLine.$RS;
290 while(not eof STDIN)
292 my @Field = read_record(\*STDIN);
294 if(defined $Expr)
296 my @F = @Field;
297 my %F = map {$_=>$Field[$Header{$_}]} keys %Header;
298 $pass = eval $Expr;
299 warn $@ if $@;
301 else
303 $pass = 0;
305 for my $filter (@Filter)
307 my $left_data = exists $Header{$filter->{'field'}} ? $Field[$Header{$filter->{'field'}}] : '';
308 my $right_data = $filter->{'value'};
309 my $operator = $filter->{'operator'};
310 my $result;
312 if($filter->{'wholeword'})
314 # have to match "whole word" - simple substring match won't be enough.
315 # transform "contains" and "contains-one-of" operators into a regexp match.
317 if($operator eq 'contains-one-of')
319 $right_data = '\b(' . (join '|', map { quotemeta($_) } split /,/, $right_data) . ')\b';
321 elsif($operator eq 'contains')
323 $right_data = '\b' . quotemeta($right_data) . '\b';
325 else { die "$0: should not happen\n"; }
326 $operator = '=~';
329 switch($operator)
331 case 'contains'
333 $result = 1 + index $left_data, $right_data;
335 case 'is-one-of'
337 $result = any { $left_data eq $_ } split /,/, $right_data;
339 case 'contains-one-of'
341 $result = any { 1 + index $left_data, $_ } split /,/, $right_data;
343 case 'starts-with'
345 $result = $right_data eq substr($left_data, 0, length $right_data);
347 case 'ends-with'
349 $result = $right_data eq substr($left_data, -length $right_data);
351 else
353 my $left_code = '$left_data';
354 my $right_code = '$right_data';
355 $right_code = "/$right_code/" if $operator ~~ ['=~', '!~'];
356 $result = eval "$left_code $operator $right_code";
360 $result = !$result if $filter->{'negation'};
362 if($result)
364 $pass = 1;
365 last;
370 if($pass)
372 print join($FS, @Field).$RS;