3 # /ifs1/ST_ASMB/USER/shiyj/work/simulate/stat/stat.pl
7 ##get options from command line into variables and set default values
8 my ($read_len, $in_file, $snp_file, $stat_out, $help);
17 if($help || !defined $read_len || !defined $in_file)
21 stat.pl --stat matrix of (ref_base,cycle,read_base,quality) through PE soap file
27 --l\t<num> read length
29 --s\t<str> SNP file [default:none]
30 --o\t<str> output stat file
36 $stat_out = $in_file . "\.stat" if(!$stat_out);
43 open IN
,$ARGV[2] || die "$!";
48 $SNP{$line[0]}{$line[1]}=1;
52 my $read1_len=$read_len;
53 my $read2_len=$read1_len;
57 open IN
,"gzip -dc $in_file|" || die "$!";
61 open IN
,$in_file || die "$!";
64 my($read1,$read2,@line1,@line2);
65 #my(%match_stat,%mis_stat);
66 my(@match,@mis,@quality,$mis_len,$ss,$i,$j);
72 @line1 = split /\s+/,$read1;
73 next if($line1[4] ne "a");
76 @line2 = split /\s+/,$read2;
79 print STDERR
"The soap result is not pair\n$read1\n$read2\n";
82 next if($line1[10]=~/S/ || $line2[10]=~/S/);#滤掉部分比对上的
83 next if($line1[3]>1 || $line2[3]>1);#滤掉非唯一比对的
85 @match = split /[ATCG]+/,$line1[-1];
86 @mis = split /\d+/,$line1[-1];
87 @quality = split //,$line1[2];
89 if($line1[-1] =~ /^\d/)
96 my @seq = split //,$line1[1];
97 for($i=0;$i<@mis;$i++)
99 for($j=0;$j<$match[$i];$j++)
101 if(!$SNP{$line1[7]}{$line1[8]+$snp})
103 # $match_stat{$quality[$ss]}[$ss]++;
104 $stat{$seq[$ss]}[$ss]{$seq[$ss]}{$quality[$ss]}++;
109 $mis_len = length($mis[$i]);
110 my @tmp = split //,$mis[$i];
111 for($j=0;$j<$mis_len;$j++)
113 if(!$SNP{$line1[7]}{$line1[8]+$snp})
115 # $mis_stat{$quality[$ss]}[$ss]++;
116 $stat{$tmp[$j]}[$ss]{$seq[$ss]}{$quality[$ss]}++;
124 for($j=0;$j<$match[$i];$j++)
126 if(!$SNP{$line1[7]}{$line1[8]+$snp})
128 # $match_stat{$quality[$ss]}[$ss]++;
129 $stat{$seq[$ss]}[$ss]{$seq[$ss]}{$quality[$ss]}++;
140 $line1[1]=~tr/ATCG/TAGC/;
141 my @seq = split //,$line1[1];
142 for($i=0;$i<@mis;$i++)
144 for($j=0;$j<$match[$i];$j++)
146 if(!$SNP{$line1[7]}{$line1[8]+$snp})
148 $stat{$seq[$read1_len-1-$ss]}[$ss]{$seq[$read1_len-1-$ss]}{$quality[$read1_len-1-$ss]}++;
153 $mis_len = length($mis[$i]);
154 $mis[$i] =~ tr/ATCG/TAGC/;
155 my @tmp = split //,$mis[$i];
156 for($j=0;$j<$mis_len;$j++)
158 if(!$SNP{$line1[7]}{$line1[8]+$snp})
160 $stat{$tmp[$j]}[$ss]{$seq[$read1_len-1-$ss]}{$quality[$read1_len-1-$ss]}++;
168 for($j=0;$j<$match[$i];$j++)
170 if(!$SNP{$line1[7]}{$line1[8]+$snp})
172 $stat{$seq[$read1_len-1-$ss]}[$ss]{$seq[$read1_len-1-$ss]}{$quality[$read1_len-1-$ss]}++;
182 die "not all mismatch information start with number";
184 @match = split /[ATCG]+/,$line2[-1];
185 @mis = split /\d+/,$line2[-1];
186 @quality = split //,$line2[2];
187 if($line2[-1] =~ /^\d/)
194 my @seq = split//,$line2[1];
195 for($i=0;$i<@mis;$i++)
197 for($j=0;$j<$match[$i];$j++)
199 if(!$SNP{$line2[7]}{$line2[8]+$snp})
201 $stat{$seq[$ss-$read1_len]}[$ss]{$seq[$ss-$read1_len]}{$quality[$ss-$read1_len]}++;
206 $mis_len = length($mis[$i]);
207 my @tmp = split //,$mis[$i];
208 for($j=0;$j<$mis_len;$j++)
210 if(!$SNP{$line2[7]}{$line2[8]+$snp})
212 $stat{$tmp[$j]}[$ss]{$seq[$ss-$read1_len]}{$quality[$ss-$read1_len]}++;
218 if($ss<$read1_len+$read2_len)
220 for($j=0;$j<$match[$i];$j++)
222 if(!$SNP{$line2[7]}{$line2[8]+$snp})
224 $stat{$seq[$ss-$read1_len]}[$ss]{$seq[$ss-$read1_len]}{$quality[$ss-$read1_len]}++;
233 $ss = $read1_len+$read2_len-1;
235 $line2[1] =~ tr/ATCG/TAGC/;
236 my @seq = split //,$line2[1];
237 for($i=0;$i<@mis;$i++)
239 for($j=0;$j<$match[$i];$j++)
241 if(!$SNP{$line2[7]}{$line2[8]+$snp})
243 $stat{$seq[$read1_len+$read2_len-1-$ss]}[$ss]{$seq[$read1_len+$read2_len-1-$ss]}{$quality[$read1_len+$read2_len-1-$ss]}++;
248 $mis_len = length($mis[$i]);
249 $mis[$i] =~ tr/ATCG/TAGC/;
250 my @tmp = split //,$mis[$i];
251 for($j=0;$j<$mis_len;$j++)
253 if(!$SNP{$line2[7]}{$line2[8]+$snp})
255 $stat{$tmp[$j]}[$ss]{$seq[$read1_len+$read2_len-1-$ss]}{$quality[$read1_len+$read2_len-1-$ss]}++;
263 for($j=0;$j<$match[$i];$j++)
265 if(!$SNP{$line2[7]}{$line2[8]+$snp})
267 $stat{$seq[$read1_len+$read2_len-1-$ss]}[$ss]{$seq[$read1_len+$read2_len-1-$ss]}{$quality[$read1_len+$read2_len-1-$ss]}++;
277 die "not all mismatch information start with number";
282 open OUT
,">$stat_out" || die "$!";
283 my(@qual,@cycle,@ref,@base);
286 $qual[$i] = chr($i+$Q_shift);
288 for($i=0;$i<$read1_len+$read2_len;$i++)
302 print OUT
"#ref_base\tcycle\t";
316 print OUT
"$r\t$c_tmp\t";
321 $stat{$r}[$c]{$b}{$q}=0 if(!$stat{$r}[$c]{$b}{$q});
322 print OUT
"$stat{$r}[$c]{$b}{$q}\t";