new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / metmerge.pl
blobda743f26f2bfdd092f80cf883427000c40e2ba7b
1 #!/bin/env perl
2 use strict;
3 use warnings;
5 use Data::Dumper;
# Command-line handling: exactly three arguments are required and the mode
# must be one of the three values dispatched at the bottom of the script.
# The pattern is anchored so that strings merely containing "pe", "se" or
# "total" (e.g. "pet") are rejected here instead of being accepted and then
# matching no dispatch branch.
die "Usage: $0 <mode(pe,se,total)> <input> <output>\n"
    if @ARGV != 3 or $ARGV[0] !~ /\A(?:pe|se|total)\z/;

my ($Mode, $Infile, $Outfile) = @ARGV;
warn "Mode=[$Mode], [$Infile]->[$Outfile]\n";

# Open the input on the bareword handle IN, which the reader subs use.
# Compressed inputs are streamed through an external decompressor chosen by
# file extension.  The list form of the pipe open passes the file name as a
# single argument without going through a shell, so names containing spaces
# or shell metacharacters are handled safely; "--" stops option parsing for
# names that begin with a dash.
if ($Infile =~ /\.bz2\z/) {
    open(IN, '-|', 'bzip2', '-dc', '--', $Infile)
        or die "Error opening $Infile: $!\n";
} elsif ($Infile =~ /\.gz\z/) {
    open(IN, '-|', 'gzip', '-dc', '--', $Infile)
        or die "Error opening $Infile: $!\n";
} else {
    open(IN, '<', $Infile) or die "Error opening $Infile: $!\n";
}
# Split one tab-separated record and decode its colon-separated header
# (first column), e.g.
#   >FCC01P5ACXX:8:1101:10000:187253#CGATGTAT/1:rep1:+:len68:W
#
# Arguments:
#   $line - one input line, already chomped
#   $isPE - true to strip a trailing "/1" or "/2" from the ID
#
# Returns an array ref [$ID, $CW, $rep, \@lineItem] where
#   $ID       - header with its last five colon fields removed
#   $CW       - last colon field of the header
#   $rep      - the "repN" field with the "rep" prefix removed
#   @lineItem - all tab-separated columns of the line
#
# Dies with "[x]ID format error." when the expected "rep" field is absent.
sub splitLine($$) {
    my ($line, $isPE) = @_;
    my @lineItem = split /\t/, $line;

    # An empty line yields no columns; treat the header as empty so the
    # format check below reports it instead of emitting undef warnings.
    my $header = defined $lineItem[0] ? $lineItem[0] : '';
    my @Head = split /:/, $header;

    my $CW = pop @Head;
    pop @Head;    # length field, e.g. "len68" - not used
    pop @Head;    # strand field, e.g. "+"     - not used
    my $rep = pop @Head;
    (defined $rep and $rep =~ s/^rep//) or die "[x]ID format error.\n";

    my $ID = join ':', @Head;
    if ($isPE) {
        $ID =~ s#/[12]$##;
    }
    return [$ID, $CW, $rep, \@lineItem];
}
# Pair up the records of one paired-end group.
#
# Argument: array ref of records [$ID, $CW, $rep, \@lineItem].  The list is
# expected to hold all first-mate records followed by the same number of
# second-mate records, so record $i is paired with record $i + N/2.
#
# Returns an array ref of [$ID, $CW, $rep, \@lineItem1, \@lineItem2].
# Dies if the record count is odd or if the $CW / $rep values of a pair
# disagree.
sub mergePEitems($) {
    my ($items) = @_;

    my $total = @$items;
    die "[x]PE lines missing.\n" if $total % 2;
    my $pairs = $total / 2;

    my @merged;
    for my $i (0 .. $pairs - 1) {
        my $first  = $items->[$i];
        my $second = $items->[$i + $pairs];
        die "[x]PE data error.\n"
            if $first->[1] ne $second->[1] or $first->[2] ne $second->[2];
        push @merged, [@$first, $second->[3]];
    }
    return \@merged;
}
# One line of look-ahead shared by the group readers (ReadItems and
# LoadTItems): the first line of the NEXT group, read while finishing the
# current one, is parked here until the following call.
my $lastLine;

# Read the next run of consecutive input lines (from IN) that share the same
# ID, as computed by splitLine().
#
# Argument: $isPE - passed through to splitLine(); when true the collected
#                   records are additionally paired by mergePEitems().
#
# Returns an array ref:
#   SE: [] of [$ID, $CW, $rep, \@lineItem]
#   PE: [] of [$ID, $CW, $rep, \@lineItem1, \@lineItem2]
# An empty array ref signals end of input.
sub ReadItems($) {
    my ($isPE) = @_;

    # Start from the parked look-ahead line if there is one, otherwise read
    # a fresh line.
    my $line;
    if (defined $lastLine) {
        $line     = $lastLine;
        $lastLine = undef;
    } else {
        $line = <IN>;
        return [] unless defined $line;
        chomp $line;
    }

    my ($groupID, @dat) = @{ splitLine($line, $isPE) };
    my @Items = ([$groupID, @dat]);

    # Keep collecting until the ID changes or the input ends.
    while (defined($line = <IN>)) {
        chomp $line;
        my ($id, @rest) = @{ splitLine($line, $isPE) };
        if ($id ne $groupID) {
            $lastLine = $line;    # belongs to the next group
            last;
        }
        push @Items, [$id, @rest];
    }

    if ($isPE) {
        return mergePEitems(\@Items);
    }
    return \@Items;
}
# Split one tab-separated line of "total" mode input and classify it by its
# column count.
#
# Argument: $line - one input line, already chomped.  The first column is
#           the record ID, e.g. >FCC01P5ACXX:8:1101:10000:15797#CGATGTAT/
#
# Returns an array ref [$ID, $isPE, \@lineItem] where $isPE is 1 for a
# 19-column (PE) line and 0 for a 12-column (SE) line.
# Dies with "[x]Line format error." for any other column count.
sub splitTLine($) {
    my ($line) = @_;
    my @lineItem = split /\t/, $line;
    my $ID       = $lineItem[0];
    my $columns  = scalar @lineItem;

    return [$ID, 1, \@lineItem] if $columns == 19;    # PE
    return [$ID, 0, \@lineItem] if $columns == 12;    # SE
    die "[x]Line format error.\n";
}
# Read the next run of consecutive "total" mode lines (from IN) that share
# the same ID (first column), using the file-level $lastLine as one line of
# look-ahead between calls.
#
# Returns an array ref:
#   [$hasPE, \@Items]  where @Items is a list of [$ID, $isPE, \@lineItem]
#                      and $hasPE is 1 if any line of the group is PE,
#                      otherwise 0
#   []                 at end of input
sub LoadTItems() {
    # Start from the parked look-ahead line if there is one, otherwise read
    # a fresh line.
    my $line;
    if (defined $lastLine) {
        $line     = $lastLine;
        $lastLine = undef;
    } else {
        $line = <IN>;
        return [] unless defined $line;
        chomp $line;
    }

    my ($groupID, $isPE, @dat) = @{ splitTLine($line) };
    my $hasPE = $isPE ? 1 : 0;
    my @Items = ([$groupID, $isPE, @dat]);

    # Keep collecting until the ID changes or the input ends.
    while (defined($line = <IN>)) {
        chomp $line;
        my ($id, $pe, @rest) = @{ splitTLine($line) };
        if ($id ne $groupID) {
            $lastLine = $line;    # belongs to the next group
            last;
        }
        $hasPE = 1 if $pe;
        push @Items, [$id, $pe, @rest];
    }

    return [$hasPE, \@Items];    # \@Items => [] of [$ID,$isPE,\@lineItem]
}
# Open (and truncate) the output file on the bareword handle OUT, which
# main_total() writes to.
open(OUT, '>', $Outfile) or die "Error opening $Outfile: $!\n";
# Driver for "se" mode: read the input group by group with ReadItems(0),
# dump each group to STDOUT followed by a 75-dash rule and the running group
# number, then report the number of groups on STDERR.
sub main_se() {
    my $count = 0;
    while (my @group = @{ ReadItems(0) }) {
        ++$count;
        print Dumper(\@group), '-' x 75, "$count\n";
    }
    warn "\nTotal Groups: $count\n";
}
# Driver for "pe" mode: read the input group by group with ReadItems(1)
# (records paired by mergePEitems), dump each group to STDOUT followed by a
# 75-dash rule and the running group number, then report the number of
# groups on STDERR.
sub main_pe() {
    my $count = 0;
    while (my @group = @{ ReadItems(1) }) {
        ++$count;
        print Dumper(\@group), '-' x 75, "$count\n";
    }
    warn "\nTotal Groups: $count\n";
}
# Driver for "total" mode.  For every group returned by LoadTItems():
#   * dump the group to STDOUT (followed by a 75-dash rule and the running
#     group number);
#   * if the group contains no PE line, write every line of it to OUT;
#   * otherwise write only the first PE line of the group to OUT.
# Finally report the number of groups on STDERR.
sub main_total() {
    my $count = 0;

    # LoadTItems() returns [] at end of input, so the list assignment sees
    # zero elements and the loop ends.
    while (my ($hasPE, $datref) = @{ LoadTItems() }) {
        ++$count;
        my @dat = @$datref;
        print Dumper(\@dat), '-' x 75, "$count\n";

        if ($hasPE) {
            # Each item is [$ID, $isPE, \@lineItem]; emit the first PE one.
            for my $item (@dat) {
                if ($item->[1] == 1) {
                    print OUT join("\t", @{ $item->[2] }), "\n";
                    last;
                }
            }
        } else {
            for my $item (@dat) {
                print OUT join("\t", @{ $item->[2] }), "\n";
            }
        }
    }
    warn "\nTotal Groups: $count\n";
}
# Run the driver selected on the command line.
if ($Mode eq 'pe') {
    main_pe();
} elsif ($Mode eq 'se') {
    main_se();
} elsif ($Mode eq 'total') {
    main_total();
} else {
    # Never finish "successfully" without having processed anything.
    die "Unknown mode [$Mode]; expected pe, se or total.\n";
}

# For a piped input, close() returns false when the decompressor exited with
# a non-zero status ($! is then 0 and the status is in $?), which would
# otherwise go unnoticed and leave silently truncated results.
close(IN)
    or warn $! ? "Error closing $Infile: $!\n"
               : "Reader for $Infile exited with status $?\n";

# Buffered write errors (e.g. disk full) only surface when the handle is
# closed, so the result must be checked.
close(OUT) or die "Error closing $Outfile: $!\n";