modified: myjupyterlab.sh
[GalaxyCodeBases.git] / released / pIRS.old / bwasam / stat-syj.pl
blobab4ba40fee4e5fe099e4977bfc533047f45b79c3
1 #!/usr/bin/perl -w
3 # /ifs1/ST_ASMB/USER/shiyj/work/simulate/stat/stat.pl
5 use strict;
6 use Getopt::Long;
7 ##get options from command line into variables and set default values
8 my ($read_len, $in_file, $snp_file, $stat_out, $help);
9 GetOptions(
10 "l:i"=>\$read_len,
11 "i:s"=>\$in_file,
12 "s:s"=>\$snp_file,
13 "o:s"=>\$stat_out,
14 "help"=>\$help
17 if($help || !defined $read_len || !defined $in_file)
19 print "Name
21 stat.pl --stat matrix of (ref_base,cycle,read_base,quality) through PE soap file
23 Usage
25 perl stat.pl [option]
27 --l\t<num> read length
28 --i\t<str> soap.pair
29 --s\t<str> SNP file [default:none]
30 --o\t<str> output stat file
31 --h\t\thelp
33 exit;
36 $stat_out = $in_file . "\.stat" if(!$stat_out);
38 my $Q_shift=64;
40 my %SNP;
41 if($snp_file)
43 open IN,$ARGV[2] || die "$!";
44 while(<IN>)
46 chomp;
47 my @line=split;
48 $SNP{$line[0]}{$line[1]}=1;
50 close IN;
52 my $read1_len=$read_len;
53 my $read2_len=$read1_len;
55 if($in_file=~/\.gz$/)
57 open IN,"gzip -dc $in_file|" || die "$!";
59 else
61 open IN,$in_file || die "$!";
64 my($read1,$read2,@line1,@line2);
65 #my(%match_stat,%mis_stat);
66 my(@match,@mis,@quality,$mis_len,$ss,$i,$j);
67 my %stat;
69 while($read1=<IN>)
71 chomp $read1;
72 @line1 = split /\s+/,$read1;
73 next if($line1[4] ne "a");
74 $read2 = <IN>;
75 chomp $read2;
76 @line2 = split /\s+/,$read2;
77 if($line2[4] ne "b")
79 print STDERR "The soap result is not pair\n$read1\n$read2\n";
80 next;
82 next if($line1[10]=~/S/ || $line2[10]=~/S/);#滤掉部分比对上的
83 next if($line1[3]>1 || $line2[3]>1);#滤掉非唯一比对的
85 @match = split /[ATCG]+/,$line1[-1];
86 @mis = split /\d+/,$line1[-1];
87 @quality = split //,$line1[2];
88 my $snp;
89 if($line1[-1] =~ /^\d/)
91 shift @mis;
92 if($line1[6] eq "+")
94 $ss = 0;
95 $snp = 0;
96 my @seq = split //,$line1[1];
97 for($i=0;$i<@mis;$i++)
99 for($j=0;$j<$match[$i];$j++)
101 if(!$SNP{$line1[7]}{$line1[8]+$snp})
103 # $match_stat{$quality[$ss]}[$ss]++;
104 $stat{$seq[$ss]}[$ss]{$seq[$ss]}{$quality[$ss]}++;
106 $ss++;
107 $snp++;
109 $mis_len = length($mis[$i]);
110 my @tmp = split //,$mis[$i];
111 for($j=0;$j<$mis_len;$j++)
113 if(!$SNP{$line1[7]}{$line1[8]+$snp})
115 # $mis_stat{$quality[$ss]}[$ss]++;
116 $stat{$tmp[$j]}[$ss]{$seq[$ss]}{$quality[$ss]}++;
118 $ss++;
119 $snp++;
122 if($ss<$read1_len)
124 for($j=0;$j<$match[$i];$j++)
126 if(!$SNP{$line1[7]}{$line1[8]+$snp})
128 # $match_stat{$quality[$ss]}[$ss]++;
129 $stat{$seq[$ss]}[$ss]{$seq[$ss]}{$quality[$ss]}++;
131 $ss++;
132 $snp++;
136 else
138 $ss = $read1_len-1;
139 $snp = 0;
140 $line1[1]=~tr/ATCG/TAGC/;
141 my @seq = split //,$line1[1];
142 for($i=0;$i<@mis;$i++)
144 for($j=0;$j<$match[$i];$j++)
146 if(!$SNP{$line1[7]}{$line1[8]+$snp})
148 $stat{$seq[$read1_len-1-$ss]}[$ss]{$seq[$read1_len-1-$ss]}{$quality[$read1_len-1-$ss]}++;
150 $ss--;
151 $snp++;
153 $mis_len = length($mis[$i]);
154 $mis[$i] =~ tr/ATCG/TAGC/;
155 my @tmp = split //,$mis[$i];
156 for($j=0;$j<$mis_len;$j++)
158 if(!$SNP{$line1[7]}{$line1[8]+$snp})
160 $stat{$tmp[$j]}[$ss]{$seq[$read1_len-1-$ss]}{$quality[$read1_len-1-$ss]}++;
162 $ss--;
163 $snp++;
166 if($ss>=0)
168 for($j=0;$j<$match[$i];$j++)
170 if(!$SNP{$line1[7]}{$line1[8]+$snp})
172 $stat{$seq[$read1_len-1-$ss]}[$ss]{$seq[$read1_len-1-$ss]}{$quality[$read1_len-1-$ss]}++;
174 $ss--;
175 $snp++;
180 else
182 die "not all mismatch information start with number";
184 @match = split /[ATCG]+/,$line2[-1];
185 @mis = split /\d+/,$line2[-1];
186 @quality = split //,$line2[2];
187 if($line2[-1] =~ /^\d/)
189 shift @mis;
190 if($line2[6] eq "+")
192 $ss = $read1_len;
193 $snp = 0;
194 my @seq = split//,$line2[1];
195 for($i=0;$i<@mis;$i++)
197 for($j=0;$j<$match[$i];$j++)
199 if(!$SNP{$line2[7]}{$line2[8]+$snp})
201 $stat{$seq[$ss-$read1_len]}[$ss]{$seq[$ss-$read1_len]}{$quality[$ss-$read1_len]}++;
203 $ss++;
204 $snp++;
206 $mis_len = length($mis[$i]);
207 my @tmp = split //,$mis[$i];
208 for($j=0;$j<$mis_len;$j++)
210 if(!$SNP{$line2[7]}{$line2[8]+$snp})
212 $stat{$tmp[$j]}[$ss]{$seq[$ss-$read1_len]}{$quality[$ss-$read1_len]}++;
214 $ss++;
215 $snp++;
218 if($ss<$read1_len+$read2_len)
220 for($j=0;$j<$match[$i];$j++)
222 if(!$SNP{$line2[7]}{$line2[8]+$snp})
224 $stat{$seq[$ss-$read1_len]}[$ss]{$seq[$ss-$read1_len]}{$quality[$ss-$read1_len]}++;
226 $ss++;
227 $snp++;
231 else
233 $ss = $read1_len+$read2_len-1;
234 $snp = 0;
235 $line2[1] =~ tr/ATCG/TAGC/;
236 my @seq = split //,$line2[1];
237 for($i=0;$i<@mis;$i++)
239 for($j=0;$j<$match[$i];$j++)
241 if(!$SNP{$line2[7]}{$line2[8]+$snp})
243 $stat{$seq[$read1_len+$read2_len-1-$ss]}[$ss]{$seq[$read1_len+$read2_len-1-$ss]}{$quality[$read1_len+$read2_len-1-$ss]}++;
245 $ss--;
246 $snp++;
248 $mis_len = length($mis[$i]);
249 $mis[$i] =~ tr/ATCG/TAGC/;
250 my @tmp = split //,$mis[$i];
251 for($j=0;$j<$mis_len;$j++)
253 if(!$SNP{$line2[7]}{$line2[8]+$snp})
255 $stat{$tmp[$j]}[$ss]{$seq[$read1_len+$read2_len-1-$ss]}{$quality[$read1_len+$read2_len-1-$ss]}++;
257 $ss--;
258 $snp++;
261 if($ss>=$read1_len)
263 for($j=0;$j<$match[$i];$j++)
265 if(!$SNP{$line2[7]}{$line2[8]+$snp})
267 $stat{$seq[$read1_len+$read2_len-1-$ss]}[$ss]{$seq[$read1_len+$read2_len-1-$ss]}{$quality[$read1_len+$read2_len-1-$ss]}++;
269 $ss--;
270 $snp++;
275 else
277 die "not all mismatch information start with number";
280 close IN;
282 open OUT,">$stat_out" || die "$!";
283 my(@qual,@cycle,@ref,@base);
284 for($i=0;$i<41;$i++)
286 $qual[$i] = chr($i+$Q_shift);
288 for($i=0;$i<$read1_len+$read2_len;$i++)
290 $cycle[$i]=$i;
292 $ref[0]="A";
293 $ref[1]="C";
294 $ref[2]="G";
295 $ref[3]="T";
296 $base[0]="A";
297 $base[1]="C";
298 $base[2]="G";
299 $base[3]="T";
301 my($r,$c,$b,$q);
302 print OUT "#ref_base\tcycle\t";
303 foreach $b (@base)
305 for($i=0;$i<41;$i++)
307 print OUT "$b$i\t";
310 print OUT "\n";
311 foreach $r (@ref)
313 foreach $c (@cycle)
315 my $c_tmp = $c + 1;
316 print OUT "$r\t$c_tmp\t";
317 foreach $b (@base)
319 foreach $q (@qual)
321 $stat{$r}[$c]{$b}{$q}=0 if(!$stat{$r}[$c]{$b}{$q});
322 print OUT "$stat{$r}[$c]{$b}{$q}\t";
325 print OUT "\n";
329 close OUT;