tabdata/td-collapse

   1 #!/usr/bin/env perl
   2
   3 =pod
   4
   5 =head1 NAME
   6
   7 td-collapse - Collapse multiple tabular data records with equivalent keys into one.
   8
   9 =head1 SYNOPSIS
  10
  11 td-collapse [I<OPTIONS>]
  12
  13 =head1 DESCRIPTION
  14
  15 It goes row-by-row over a sorted tabular data stream
  16 and if the first (key) cells of 2 or more consecutive rows are
  17 the same, then it collapses them into one row.
  18 This is done by joining corresponding cells' data from each row into one
  19 cell, effectively keeping every column's data in the same column.
  20
  21 If you want to group by another column, not the first one, then first
  22 reorder the columns by td-select(1). Eg. C<td-select KEYCOLUMN +REST>.
  23
  24 =head1 OPTIONS
  25
  26 =over 4
  27
  28 =item -g, --glue I<STR>
  29
  30 Delimiter character or string between joined cell data.
  31 Default is space.
  32
  33 =item -u, --distribute-unique-field I<FIELD>
  34
  35 Take the I<FIELD> column's cells from the first collapsed group,
  36 and replicate every other column once for each row in this group,
  37 so that each cell goes under a new column corresponding to that cell's original row.
  38 The I<FIELD> column's cells need to be unique within each group.
  39
  40 If an unexpected value is found while processing the 2nd row group and onwards,
  41 ie. a value which was not there in the first group,
  42 it won't be distributed into a new column, since the header is already sent,
  43 but left in the original column, as if the B<-u> option were not in effect.
  44 See "pause" and "resume" in the example below.
  45
  46 B<Example>:
  47
  48  ID | EVENT  | TIME  | STATUS
  49  15 | start  | 10:00 |
  50  15 | end    | 10:05 | ok
  51  16 | start  | 11:00 |
  52  16 | end    | 11:06 | err
  53  16 | pause  | 11:04 |
  54  16 | resume | 11:05 |
  55
  56  td-collapse -u EVENT -z
  57
  58  COUNT | ID | EVENT        | TIME        | STATUS | TIME_start | TIME_end | STATUS_start | STATUS_end
  59  2     | 15 |              |             |        | 10:00      | 10:05    |              | ok
  60  4     | 16 | pause resume | 11:04 11:05 |        | 11:00      | 11:06    |              | err
  61
  62 =item -s, --distributed-column-name-separator I<STR>
  63
  64 When generating new columns as described at B<-u> option,
  65 join the original column name with each of the unique field's values
  66 by I<STR> string.
  67 See example at B<-u> option description.
  68 Default is underscore C<_>.
  69
  70 =item -k, --keep-equivalent-cells-united
  71
  72 Don't repeat the original cells' content
  73 in the collapsed cell if all of the original cells are the same.
  74
  75 =item -z, --empty-distributed-cells
  76
  77 Clear cells of which data moved to other columns by B<-u> option.
  78
  79 =back
  80
  81 =head1 EXAMPLES
  82
  83 This pipeline shows which users are using each of the configured default
  84 shells, grouped by shell path.
  85
  86   # get the list of users
  87   getent passwd |\
  88
  89   # transform into tabular data stream
  90   tr : "\t" |\
  91   td-add-headers USER X UID GID GECOS HOME SHELL |\
  92
  93   # put the shell in the first column, and sort, then collapse
  94   td-select SHELL USER | td-keepheader sort | td-collapse -g ' ' |\
  95
  96   # change header name "USER" to "USERS"
  97   td-alter USERS=USER | td-select +ALL -USER
  98
  99 B<Output>:
 100
 101   | COUNT | SHELL             | USERS                                        |
 102   | 4     | /bin/bash         | user1 user2 nova root                        |
 103   | 5     | /bin/false        | fetchmail hplip sddm speech-dispatcher sstpc |
 104   | 1     | /bin/sync         | sync                                         |
 105   | 1     | /sbin/rebootlogon | reboot                                       |
 106   | 6     | /usr/sbin/nologin | _apt avahi avahi-autoipd backup bin daemon   |
 107
 108 =head1 CAVEATS
 109
 110 Have to sort input data first.
 111
 112 Group key is always the first input column.
 113
 114 If a row in the input data has more cells than the number of columns, those are ignored.
 115
 116 =head1 SEE ALSO
 117
 118 td-expand(1) is a kind of an inverse to td-collapse(1).
 119
 120 =head1 REFERENCES
 121
 122 td-collapse(1) roughly translates to SELECT COUNT(*) + GROUP_CONCAT() + GROUP BY in SQL.
 123
 124 =cut
 125
 126
 127 $OptGlue = " ";
 128 $OptDistUniqueField = undef;
 129 $OptDistColumnNameSeparator = "_";
 130 $OptKeepSameCells = 0;
 131 $OptEmptyDistCells = 0;
 132 %OptionDefs = (
 133         'g|glue=s' => \$OptGlue,
 134         'u|distribute-unique-field=s' => \$OptDistUniqueField,
 135         's|distributed-column-name-separator=s' => \$OptDistColumnNameSeparator,
 136         'k|keep-equivalent-cells-united!' => \$OptKeepSameCells,
 137         'z|empty-distributed-cells!' => \$OptEmptyDistCells,
 138 );
 139
 140 use Data::Dumper;
 141 use List::MoreUtils qw/all/;
 142 no if ($] >= 5.018), 'warnings' => 'experimental::smartmatch';
 143 do '/usr/lib/tool/perl5/tabdata/common.pl' or die "$@";
 144
 145 process_header(scalar <STDIN>);
 146 $last_input_column_idx = $#Header;
 147
 148 if(defined $OptDistUniqueField)
 149 {
 150         die "$0: no such column: $OptDistUniqueField\n" if not exists $Header{$OptDistUniqueField};
 151         $uniq_field_idx = $Header{$OptDistUniqueField};
 152 }
 153
 154
 155 sub make_collapsed_cell
 156 {
 157         my $first_input_cell = $_[0];
 158         if($OptKeepSameCells and all {$_ eq $first_input_cell} @_)
 159         {
 160                 return $first_input_cell;
 161         }
 162         return join $OptGlue, @_;
 163 }
 164
 165 sub commit_group
 166 {
 167         if($group_count == 0)
 168         {
 169                 if($OptDistUniqueField)
 170                 {
 171                         for my $colidx (1..$#Header)
 172                         {
 173                                 next if $colidx == $uniq_field_idx;
 174                                 @distributed_cells = @{$group_members[$uniq_field_idx]};
 175                                 for my $uniq_cell (@distributed_cells)
 176                                 {
 177                                         my $distrib_colname = $Header[$colidx] . $OptDistColumnNameSeparator . $uniq_cell;
 178                                         push @Header, $distrib_colname;
 179                                         $Header{$distrib_colname} = $#Header;
 180                                 }
 181                         }
 182                 }
 183                 print join($FS, "COUNT", @Header).$RS;
 184         }
 185
 186         if($OptDistUniqueField)
 187         {
 188                 for my $uniq_cell_idx (0..$#{$group_members[$uniq_field_idx]})
 189                 {
 190                         my $uniq_cell = $group_members[$uniq_field_idx]->[$uniq_cell_idx];
 191                         next if not $uniq_cell ~~ @distributed_cells;
 192                         for my $colidx (1..$last_input_column_idx)
 193                         {
 194                                 my $colname = $Header[$colidx];
 195                                 my $distrib_colname = $colname . $OptDistColumnNameSeparator . $uniq_cell;
 196                                 $group_members[$Header{$distrib_colname}]->[0] = $group_members[$colidx]->[$uniq_cell_idx];
 197                                 $group_members[$colidx]->[$uniq_cell_idx] = undef if $OptEmptyDistCells;
 198                         }
 199                         $group_members[$uniq_field_idx]->[$uniq_cell_idx] = undef if $OptEmptyDistCells;
 200                 }
 201                 for my $colidx (1..$last_input_column_idx)
 202                 {
 203                         @{$group_members[$colidx]} = grep {defined} @{$group_members[$colidx]};
 204                 }
 205                 @{$group_members[$uniq_field_idx]} = grep {defined} @{$group_members[$uniq_field_idx]};
 206         }
 207
 208         print $collapsed_rows . $FS . $prev_group_key . $FS . join($FS, map {make_collapsed_cell(@$_)} @group_members[1..$#group_members]) . $RS;
 209         @group_members = ();
 210         $collapsed_rows = 0;
 211         $group_count++;
 212 }
 213
 214 $group_key = undef;
 215 $prev_group_key = undef;
 216 @group_members = ();
 217 $collapsed_rows = 0;
 218 $group_count = 0;
 219 @distributed_cells = ();
 220
 221
 222 while(not eof STDIN)
 223 {
 224         my @input_row = read_record(\*STDIN);
 225
 226         $group_key = $input_row[0];
 227
 228         if(defined $prev_group_key and $group_key ne $prev_group_key)
 229         {
 230                 commit_group;
 231         }
 232
 233         for my $cell_idx (1..$last_input_column_idx)
 234         {
 235                 my $cell = $input_row[$cell_idx];
 236                 $cell = '' unless defined $cell;
 237                 push @{$group_members[$cell_idx]}, $cell;
 238         }
 239
 240         $collapsed_rows++;
 241         $prev_group_key = $group_key;
 242 }
 243
 244 END { commit_group; }