upload pod file
[hband-tools.git] / tabdata / td-filter
blobe8e68a70f7bcfea8807d933619b73dfaaa24e703
1 #!/usr/bin/env perl
3 =pod
5 =head1 NAME
7 td-filter - Show only those records from the input tabular data stream which match to the conditions.
9 =head1 USAGE
11 td-filter [I<OPTIONS>] [--] I<COLUMN> I<OPERATOR> I<R-VALUE> [[or] I<COLUMN> I<OPERATOR> I<R-VALUE> [[or] ...]]
13 td-filter [I<OPTIONS>] --perl I<EXPR>
15 =head1 DESCRIPTION
17 Pass through those records which match at least one of the conditions (inclusive OR).
18 A condition consists of a triplet of I<COLUMN>, I<OPERATOR>, and I<R-VALUE>.
19 You may put together conditions conjunctively (AND) by chaining multiple td-filter(1) commands by shell pipes.
20 Example:
22 td-filter NAME eq john NAME eq jacob | tr-filter AGE -gt 18
24 This gives the records with either john or jacob, and all of them will be above 18.
26 The optional word "B<or>" between triplets makes your code more explicite.
28 td-filter(1) evaluates the Perl expression in the second form and passes through records
29 only if the result is true-ish in Perl (non zero, non empty string, etc).
30 Each field's value is in @F by index, and in %F by column name.
31 You can implement more complex conditions in this way.
33 =head1 OPTIONS
35 =over 4
37 =item -H, --no-header
39 do not show headers
41 =item -h, --header
43 show headers (default)
45 =item -i, --ignore-non-existing-columns
47 do not treat non-existing (missing or typo) column names as failure
49 =item -w, --warn-non-existing-columns
51 only show warning on non-existing (missing or typo) column names, but don't fail
53 =item -N, --no-fail-non-numeric
55 do not fail when a non-numeric r-value is given to a numeric operator
57 =item -W, --no-warn-non-numeric
59 do not show warning when a non-numeric r-value is given to a numeric operator
61 =back
63 =head1 OPERATORS
65 These operators are supported, semantics are the same as in Perl, see perlop(1).
67 == != <= >= < > =~ !~ eq ne gt lt
69 For your convenience, not to bother with escaping, you may also use these operators as alternatives to the canonical ones above:
71 =over 4
73 =item is
75 =item = I<(single equal sign)>
77 string equality (B<eq>)
79 =item is not
81 string inequality (B<ne>)
83 =item -eq
85 numeric equality (B<==>)
87 =item -ne
89 numeric inequality (B<!=>)
91 =item <>
93 numeric inequality (B<!=>)
95 =item -gt
97 numeric greater than (B<E<gt>>)
99 =item -ge
101 numeric greater or equal (B<E<gt>=>)
103 =item -lt
105 numeric less than (B<E<lt>>)
107 =item -le
109 numeric less or equal (B<E<lt>=>)
111 =item match
113 =item matches
115 regexp match (B<=~>)
117 =item does not match
119 =item do not match
121 =item not match
123 negated regexp match (B<!~>)
125 =back
127 Other operators:
129 =over 4
131 =item is [not] one of
133 =item is [not] any of
135 I<R-VALUE> is split into pieces by commas (C<,>) and
136 equality to at least one of them is required.
137 Equality to none of them is required if the operator is negated.
139 =item contains [whole word]
141 Substring match.
142 Plural form "contain" is also accepted.
143 Optional I<whole word> is a literal part of the operator.
145 =item contains [one | any] [whole word] of
147 Similar to B<is one of>, but substring match is checked
148 instead of full string equality.
149 Plural form "contain" is also accepted.
150 Optional I<whole word> is a literal part of the operator.
152 =item ends with
154 =item starts with
156 Plural forms are also accepted.
158 =back
160 Operators may be preceeded by I<not>, I<does not>, I<do not> to negate their effect.
162 =head1 CAVEATS
164 If there is no I<COLUMN> column in the input data, it's silently considered empty.
165 td-filter(1) does not need I<R-VALUE> to be quoted or escaped, however your shell may do.
167 =head1 REFERENCES
169 td-filter(1) is analogous to SQL WHERE.
171 =cut
174 $OptShowHeader = 1;
175 $OptWarnBadColumnNames = 1;
176 $OptFailBadColumnNames = 1;
177 $OptWarnNonNumericRValue = 1;
178 $OptFailNonNumericRValue = 1;
179 %OptionDefs = (
180 'H|no-header' => sub { $OptShowHeader = 0; },
181 'h|header' => sub { $OptShowHeader = 1; },
182 'i|ignore-non-existing-columns' => sub { $OptFailBadColumnNames = 0; $OptWarnBadColumnNames = 0; },
183 'w|warn-non-existing-columns' => sub { $OptFailBadColumnNames = 0; $OptWarnBadColumnNames = 1; },
184 'N|no-fail-non-numeric' => sub { $OptFailNonNumericRValue = 0; },
185 'W|no-warn-non-numeric' => sub { $OptWarnNonNumericRValue = 0; },
188 use Switch;
189 use List::MoreUtils qw/all any none uniq/;
190 use Data::Dumper;
191 no if ($] >= 5.018), 'warnings' => 'experimental::smartmatch';
192 do '/usr/lib/tool/perl5/tabdata/common.pl' or die "$@";
194 use Math::Cartesian::Product;
197 sub subst_alternate_forms
199 my $words = shift;
200 my $altforms = shift;
201 my @product_terms = ();
202 for my $word (split /\s+/, $words)
204 push @product_terms, ($word ~~ [keys %$altforms]) ? $altforms->{$word} : [$word];
206 my @results = ();
207 cartesian {push @results, join ' ', @_} @product_terms;
208 return @results;
210 sub parse_operator
212 my @args = @_;
213 my %alternate_form = (
214 'ANY' => [qw/one any/],
215 'MATCH' => [qw/match matches/],
216 'CONTAIN' => [qw/contain contains/],
217 'DOES' => [qw/do does/],
218 'START' => [qw/start starts/],
219 'END' => [qw/end ends/],
221 my @opdef = (
222 { phrase => 'MATCH', operator => '=~'},
223 { phrase => 'not MATCH', operator => '!~'},
224 { phrase => 'DOES not match', operator => '!~'},
226 { phrase => 'is not ANY of', operator => 'is-one-of', negation => 1},
227 { phrase => 'is ANY of', operator => 'is-one-of'},
228 { phrase => 'is not', operator => 'ne'},
229 { phrase => 'is', operator => 'eq'},
231 { phrase => '=', operator => 'eq'},
232 { phrase => '-eq', operator => '=='},
233 { phrase => '-ne', operator => '!='},
234 { phrase => '<>', operator => '!='},
235 { phrase => '-gt', operator => '>'},
236 { phrase => '-ge', operator => '>='},
237 { phrase => '-lt', operator => '<'},
238 { phrase => '-le', operator => '<='},
240 { phrase => 'CONTAIN whole word', operator => 'contains', wholeword => 1},
241 { phrase => 'CONTAIN ANY whole word of', operator => 'contains-one-of', wholeword => 1},
242 { phrase => 'CONTAIN ANY of', operator => 'contains-one-of'},
243 { phrase => 'CONTAIN', operator => 'contains'},
244 { phrase => 'not CONTAIN whole word', operator => 'contains', negation => 1, wholeword => 1},
245 { phrase => 'not CONTAIN ANY whole word of', operator => 'contains-one-of', negation => 1, wholeword => 1},
246 { phrase => 'not CONTAIN ANY of', operator => 'contains-one-of', negation => 1},
247 { phrase => 'not CONTAIN', operator => 'contains', negation => 1},
248 { phrase => 'DOES not contain ANY whole word of', operator => 'contains-one-of', negation => 1, wholeword => 1},
249 { phrase => 'DOES not contain whole word', operator => 'contains', negation => 1, wholeword => 1},
250 { phrase => 'DOES not contain ANY of', operator => 'contains-one-of', negation => 1},
251 { phrase => 'DOES not contain', operator => 'contains', negation => 1},
253 { phrase => 'START with', operator => 'starts-with'},
254 { phrase => 'not START with', operator => 'starts-with', negation => 1},
255 { phrase => 'DOES not start with', operator => 'starts-with', negation => 1},
257 { phrase => 'END with', operator => 'ends-with'},
258 { phrase => 'not END with', operator => 'ends-with', negation => 1},
259 { phrase => 'DOES not end with', operator => 'ends-with', negation => 1},
261 my @canonical_operators = uniq map {$_->{operator}} @opdef;
262 for my $opdef (@opdef)
264 my $phrase = $opdef->{phrase};
265 my $nwords = scalar split /\s+/, $phrase;
266 my @phrases = subst_alternate_forms($phrase, \%alternate_form);
267 #warn Dumper \@phrases;
268 for my $possible_operator (@phrases)
270 my $argstr = join ' ', @args[0..($nwords-1)];
271 if($argstr eq $possible_operator)
273 my %result = %$opdef;
274 $result{remaining_args} = [@args[$nwords..$#args]];
275 return \%result;
279 if($args[0] ~~ [@canonical_operators])
281 return {operator => $args[0], remaining_args => [@args[1..$#args]]};
283 die "valid operator expected; found this: @args";
288 @Filter = ();
289 $Expr = undef;
292 if($ARGV[0] eq '--perl')
294 shift @ARGV;
295 die "$0: missing Perl expression\n" unless @ARGV;
296 $Expr = join ';', @ARGV;
298 else
300 while(scalar @ARGV)
302 shift @ARGV if $ARGV[0] eq 'or';
304 my $column = shift @ARGV;
306 my $complex_operator = parse_operator(@ARGV);
307 #warn Dumper $complex_operator;
308 my $operator = $complex_operator->{operator};
309 my $negation = $complex_operator->{negation};
310 my $wholeword = $complex_operator->{wholeword};
311 @ARGV = @{$complex_operator->{remaining_args}};
313 my $rvalue = shift @ARGV;
314 die "$0: missing r-value for: $column $operator\n" if not defined $rvalue;
316 if($operator ~~ [qw/== != <= >= < >/] and $rvalue !~ /^[+-]?[0-9]\d*(.\d+|)(E[+-]?\d+|)$/i)
318 warn "$0: operator ($operator) is numeric but r-value ($rvalue) is not\n" if $OptWarnNonNumericRValue;
319 exit -1 if $OptFailNonNumericRValue;
322 push @Filter, {'field'=>$column, 'operator'=>$operator, 'negation'=>$negation, 'value'=>$rvalue, 'wholeword'=>$wholeword};
326 process_header(scalar <STDIN>);
328 if(not defined $Expr)
330 for my $filter (@Filter)
332 my $colname = $filter->{'field'};
333 if(not exists $Header{$colname})
335 my $cols = join ", ", @Header;
336 warn "$0: $colname: no such column. known columns: $cols\n" if $OptWarnBadColumnNames;
337 exit 3 if $OptFailBadColumnNames;
342 if($OptShowHeader and length $HeaderLine)
344 print $HeaderLine.$RS;
347 while(not eof STDIN)
349 my @Field = read_record(\*STDIN);
351 if(defined $Expr)
353 my @F = @Field;
354 my %F = map {$_=>$Field[$Header{$_}]} keys %Header;
355 $pass = eval $Expr;
356 warn $@ if $@;
358 else
360 $pass = 0;
362 for my $filter (@Filter)
364 my $left_data = exists $Header{$filter->{'field'}} ? $Field[$Header{$filter->{'field'}}] : '';
365 my $right_data = $filter->{'value'};
366 my $operator = $filter->{'operator'};
367 my $result;
369 if($filter->{'wholeword'})
371 # have to match "whole word" - simple substring match won't be enough.
372 # transform "contains" and "contains-one-of" operators into a regexp match.
374 if($operator eq 'contains-one-of')
376 $right_data = '\b(' . (join '|', map { quotemeta($_) } split /,/, $right_data) . ')\b';
378 elsif($operator eq 'contains')
380 $right_data = '\b' . quotemeta($right_data) . '\b';
382 else { die "$0: should not happen\n"; }
383 $operator = '=~';
386 switch($operator)
388 case 'contains'
390 $result = 1 + index $left_data, $right_data;
392 case 'is-one-of'
394 $result = any { $left_data eq $_ } split /,/, $right_data;
396 case 'contains-one-of'
398 $result = any { 1 + index $left_data, $_ } split /,/, $right_data;
400 case 'starts-with'
402 $result = $right_data eq substr($left_data, 0, length $right_data);
404 case 'ends-with'
406 $result = $right_data eq substr($left_data, -length $right_data);
408 else
410 my $left_code = '$left_data';
411 my $right_code = '$right_data';
412 $right_code = "/$right_code/" if $operator ~~ ['=~', '!~'];
413 $result = eval "$left_code $operator $right_code";
417 $result = !$result if $filter->{'negation'};
419 if($result)
421 $pass = 1;
422 last;
427 if($pass)
429 print join($FS, @Field).$RS;