make getpeername() return the original socket address which before it was intercepted
[hband-tools.git] / tabdata / td-collapse
blob8b21823c093839d994d69b03b88f373582d11983
1 #!/usr/bin/env perl
3 =pod
5 =head1 NAME
7 td-collapse - Collapse multiple tabular data records with equivalent keys into one.
9 =head1 SYNOPSIS
11 td-collapse [I<OPTIONS>]
13 =head1 DESCRIPTION
15 It goes row-by-row on a sorted tabular data stream
16 and if 2 or more subsequent rows' first (key) cell are
17 the same then collapse them into one row.
18 This is done by joining corresponding cells' data from each row into one
19 cell, effectively keeping every column's data in the same column.
21 If you want to group by an other column, not the first one, then first
22 reorder the columns by td-select(1). Eg. C<td-select KEYCOLUMN +REST>.
24 =head1 OPTIONS
26 =over 4
28 =item -g, --glue I<STR>
30 Delimiter character or string between joined cell data.
31 Default is space.
33 =item -u, --distribute-unique-field I<FIELD>
35 Take the I<FIELD> column's cells from the first collapsed group,
36 and multiplicate all other columns as many times as many rows are in this group,
37 in a way that each cell goes under a new column corresponding to that cell's original row.
38 I<FIELD> field's cells need to be unique within each groups.
40 If an unexpected value found during processing the 2nd row group and onwards,
41 ie. a value which was not there in the first group,
42 it won't be distibuted into the new column, since the header is already sent,
43 but left in the original column just like B<-u> option would not be in effect.
44 See "pause" and "resume" in the example below.
46 B<Example>:
48 ID | EVENT | TIME | STATUS
49 15 | start | 10:00 |
50 15 | end | 10:05 | ok
51 16 | start | 11:00 |
52 16 | end | 11:06 | err
53 16 | pause | 11:04 |
54 16 | resume | 11:05 |
56 td-collapse -u EVENT -z
58 COUNT | ID | EVENT | TIME | TIME_start | TIME_end | STATUS | STATUS_start | STATUS_end
59 2 | 15 | | | 10:00 | 10:05 | | | ok
60 4 | 16 | pause resume | 11:04 11:05 | 11:00 | 11:06 | | | err
62 =item -s, --distributed-column-name-separator I<STR>
64 When generating new columns as described at B<-u> option,
65 join the original column name with each of the unique field's values
66 by I<STR> string.
67 See example at B<-u> option description.
68 Default is underscore C<_>.
70 =item -k, --keep-equivalent-cells-united
72 Don't repeat the original cells' content
73 in the collapsed cell if all of the original cell are the same.
75 =item -z, --empty-distributed-cells
77 Clear cells of which data moved to other columns by B<-u> option.
79 =back
81 =head1 EXAMPLES
83 This pipeline shows which users are using each of the configured default
84 shells, grouped by shell path.
86 # get the list of users
87 getent passwd |\
89 # transform into tabular data stream
90 tr : "\t" |\
91 td-add-headers USER X UID GID GECOS HOME SHELL |\
93 # put the shell in the first column, and sort, then collapse
94 td-select SHELL USER | td-keepheader sort | td-collapse -g ' ' |\
96 # change header name "USER" to "USERS"
97 td-alter USERS=USER | td-select +ALL -USER
99 B<Output>:
101 | COUNT | SHELL | USERS |
102 | 4 | /bin/bash | user1 user2 nova root |
103 | 5 | /bin/false | fetchmail hplip sddm speech-dispatcher sstpc |
104 | 1 | /bin/sync | sync |
105 | 1 | /sbin/rebootlogon | reboot |
106 | 6 | /usr/sbin/nologin | _apt avahi avahi-autoipd backup bin daemon |
108 =head1 CAVEATS
110 Have to sort input data first.
112 Group key is always the first input column.
114 If a row in the input data has more cells than the number of columns, those are ignored.
116 =head1 SEE ALSO
118 td-expand(1) is a kind of an inverse to td-collapse(1).
120 =head1 REFERENCES
122 td-collapse(1) roughly translates to SELECT COUNT(*) + GROUP_CONCAT() + GROUP BY in SQL.
124 =cut
127 $OptGlue = " ";
128 $OptDistUniqueField = undef;
129 $OptDistColumnNameSeparator = "_";
130 $OptKeepSameCells = 0;
131 $OptEmptyDistCells = 0;
132 %OptionDefs = (
133 'g|glue=s' => \$OptGlue,
134 'u|distribute-unique-field=s' => \$OptDistUniqueField,
135 's|distributed-column-name-separator=s' => \$OptDistColumnNameSeparator,
136 'k|keep-equivalent-cells-united!' => \$OptKeepSameCells,
137 'z|empty-distributed-cells!' => \$OptEmptyDistCells,
140 use Data::Dumper;
141 use List::MoreUtils qw/all/;
142 no if ($] >= 5.018), 'warnings' => 'experimental::smartmatch';
143 do '/usr/lib/tool/perl5/tabdata/common.pl' or die "$@";
145 process_header(scalar <STDIN>);
146 $last_input_column_idx = $#Header;
148 if(defined $OptDistUniqueField)
150 die "$0: no such column: $OptDistUniqueField\n" if not exists $Header{$OptDistUniqueField};
151 $uniq_field_idx = $Header{$OptDistUniqueField};
155 sub make_collapsed_cell
157 my $first_input_cell = $_[0];
158 if($OptKeepSameCells and all {$_ eq $first_input_cell} @_)
160 return $first_input_cell;
162 return join $OptGlue, @_;
165 sub commit_group
167 if($group_count == 0)
169 if($OptDistUniqueField)
171 for my $colidx (1..$#Header)
173 next if $colidx == $uniq_field_idx;
174 @distributed_cells = @{$group_members[$uniq_field_idx]};
175 for my $uniq_cell (@distributed_cells)
177 my $distrib_colname = $Header[$colidx] . $OptDistColumnNameSeparator . $uniq_cell;
178 push @Header, $distrib_colname;
179 $Header{$distrib_colname} = $#Header;
183 print join($FS, "COUNT", @Header).$RS;
186 if($OptDistUniqueField)
188 for my $uniq_cell_idx (0..$#{$group_members[$uniq_field_idx]})
190 my $uniq_cell = $group_members[$uniq_field_idx]->[$uniq_cell_idx];
191 next if not $uniq_cell ~~ @distributed_cells;
192 for my $colidx (1..$last_input_column_idx)
194 my $colname = $Header[$colidx];
195 my $distrib_colname = $colname . $OptDistColumnNameSeparator . $uniq_cell;
196 $group_members[$Header{$distrib_colname}]->[0] = $group_members[$colidx]->[$uniq_cell_idx];
197 $group_members[$colidx]->[$uniq_cell_idx] = undef if $OptEmptyDistCells;
199 $group_members[$uniq_field_idx]->[$uniq_cell_idx] = undef if $OptEmptyDistCells;
201 for my $colidx (1..$last_input_column_idx)
203 @{$group_members[$colidx]} = grep {defined} @{$group_members[$colidx]};
205 @{$group_members[$uniq_field_idx]} = grep {defined} @{$group_members[$uniq_field_idx]};
208 print $collapsed_rows . $FS . $prev_group_key . $FS . join($FS, map {make_collapsed_cell(@$_)} @group_members[1..$#group_members]) . $RS;
209 @group_members = ();
210 $collapsed_rows = 0;
211 $group_count++;
214 $group_key = undef;
215 $prev_group_key = undef;
216 @group_members = ();
217 $collapsed_rows = 0;
218 $group_count = 0;
219 @distributed_cells = ();
222 while(not eof STDIN)
224 my @input_row = read_record(\*STDIN);
226 $group_key = $input_row[0];
228 if(defined $prev_group_key and $group_key ne $prev_group_key)
230 commit_group;
233 for my $cell_idx (1..$last_input_column_idx)
235 my $cell = $input_row[$cell_idx];
236 $cell = '' unless defined $cell;
237 push @{$group_members[$cell_idx]}, $cell;
240 $collapsed_rows++;
241 $prev_group_key = $group_key;
244 END { commit_group; }