modified: pixi.toml
[GalaxyCodeBases.git] / released / pIRS.old / bwasam / analys_matrix.pl
blob33a842a2050a99da4f56a6ca3e78fc5299ac91e6
1 #!/usr/bin/perl -w
3 =head1 Name
5 error_matrix_analyzer.pl
7 =head1 Description
9 analyse the matrix about reads' characteristic count from comparison results
11 the program need the matrix file as input.It will creat 6 files:
12 *.misVSerr.base.stat : mismatch rate and error rate calculate by quality value for every cycle
13 *.qualVSmis.stat : compair the real mismatch rate and the calculated error rate for each quality value
14 *.transform.cycle.stat : the rate of the reference nucleotide be sequenced to each for nucleotide for every cycle
15 *.transform.avg.stat : the average the reference nucleotide be sequenced to each for nucleotide for all cycle along read
16 *.qual.mat.dis : the distribution of quality value for matched read base
17 *.qual.mis.dis : the distribution of quality value for mismatched read base
19 =head1 Version
21 Author: Shi Yujian , shiyujian@genomics.org.cn
22 Version: 1.1 , Date:2011-8
24 =head1 Usage
26 perl error_matrix_analyzer.pl [option]
27 -i <str> matrix file
28 -o <str> output prefix
29 -m <num> min quality score[default:0]
30 -x <num> max quality score[default:40]
31 -B ignore the bases that quality is B or #
32 -h help
34 =cut
36 use strict;
37 use Getopt::Long;
39 ##get options from command line into variables and set default values
40 my ($len, $in_file, $out, $min_qual, $max_qual, $help, $maskB);
41 GetOptions(
42 "i:s"=>\$in_file,
43 "o:s"=>\$out,
44 "m:i"=>\$min_qual,
45 "x:i"=>\$max_qual,
46 "B"=>\$maskB,
47 "help"=>\$help
50 die `pod2text $0` if($help || !defined $in_file);
51 $min_qual ||= 0;
52 $max_qual ||= 40;
53 if($in_file=~/\/([^\/]+)$/)
55 $out ||= $1;
57 else
59 $out ||= $in_file;
62 my %seq2bit = ("A"=>0,"C"=>1,"G"=>2,"T"=>3);
63 my @bit2seq = ("A","C","G","T");
64 my @matrix;
65 open IN,$in_file || die"$!";
66 while(<IN>)
68 next if(/#/);
69 next unless(/\S+/);
70 chomp;
71 my @line = split;
72 my $k = 2;
73 for(my $i=0;$i<4;$i++)
75 for(my $j=$min_qual;$j<=$max_qual;$j++)
77 $matrix[$seq2bit{$line[0]}][$line[1]-1][$i][$j] = $line[$k];
78 $k++;
80 if($maskB)
82 $matrix[$seq2bit{$line[0]}][$line[1]-1][$i][2] = 0;
86 close IN;
88 my @q2e;
89 for(my $i=$min_qual;$i<=$max_qual;$i++)
91 $q2e[$i] = 10 ** (-$i / 10);
94 my(@sum_row,@sum_cycle,@sum_ref,@sum_cycle_match,@sum_cycle_err,@transform_cycle,@transform_avg);
95 my(@match_qual_cycle,@mis_qual_cycle);
96 my(@match_qual_sum,$match_sum,@mis_qual_sum,$mis_sum,$err_sum);
97 for(my $ref=0;$ref<@matrix;$ref++)
99 for(my $cycle=0;$cycle<@{$matrix[$ref]};$cycle++)
101 for(my $base=0;$base<@{$matrix[$ref][$cycle]};$base++)
103 my $sum_tmp = 0;
104 for(my $qual=$min_qual;$qual<=$max_qual;$qual++)
106 $sum_tmp += $matrix[$ref][$cycle][$base][$qual];
107 my $err_tmp = $matrix[$ref][$cycle][$base][$qual] * $q2e[$qual];
108 $sum_cycle_err[$cycle] += $err_tmp;
109 $err_sum += $err_tmp;
110 if($ref == $base)
112 $sum_cycle_match[$cycle] += $matrix[$ref][$cycle][$base][$qual];
113 $match_qual_cycle[$cycle][$qual] += $matrix[$ref][$cycle][$base][$qual];
114 $match_qual_sum[$qual] += $matrix[$ref][$cycle][$base][$qual];
115 $match_sum += $matrix[$ref][$cycle][$base][$qual];
117 else
119 $mis_qual_cycle[$cycle][$qual] += $matrix[$ref][$cycle][$base][$qual];
120 $mis_qual_sum[$qual] += $matrix[$ref][$cycle][$base][$qual];
121 $mis_sum += $matrix[$ref][$cycle][$base][$qual];
124 $sum_row[$ref][$cycle] += $sum_tmp;
125 $transform_cycle[$cycle][$ref][$base] += $sum_tmp;
126 $transform_avg[$ref][$base] += $sum_tmp;
128 $sum_cycle[$cycle] += $sum_row[$ref][$cycle];
129 $sum_ref[$ref] += $sum_row[$ref][$cycle];
133 #open OUT,">$out\.ratio" || die "$!";
134 my $total_base = $match_sum + $mis_sum;
135 #print OUT "#total_base_number:\t$total_base\n";
136 #print OUT "#base_rario:\t";
137 #for(my $ref=0;$ref<@matrix;$ref++)
139 # my $sum_ref_rate = $sum_ref[$ref] / $total_base;
140 # print OUT "$bit2seq[$ref]:$sum_ref_rate\t";
142 #print OUT "\n#ref_base\tcycle\t";
143 #for(my $base=0;$base<@{$matrix[0][0]};$base++)
145 # for(my $qual=$min_qual;$qual<=$max_qual;$qual++)
147 # print OUT "$bit2seq[$base]$qual\t";
150 #print OUT "\n";
151 #for(my $ref=0;$ref<@matrix;$ref++)
153 # for(my $cycle=0;$cycle<@{$matrix[$ref]};$cycle++)
155 # my $cycle_tmp = $cycle + 1;
156 # print OUT "$bit2seq[$ref]\t$cycle_tmp\t";
157 # for(my $base=0;$base<@{$matrix[$ref][$cycle]};$base++)
159 # for(my $qual=$min_qual;$qual<=$max_qual;$qual++)
161 # my $ratio = 0;
162 # $ratio = $matrix[$ref][$cycle][$base][$qual] / $sum_row[$ref][$cycle] if($sum_row[$ref][$cycle]);
163 # print OUT "$ratio\t";
166 # print OUT "\n";
169 #close OUT;
171 open OUT,">$out\.misVSerr\.base\.stat" || die "$!";
172 my $avg_mis_rate = $mis_sum / $total_base;
173 my $avg_err_rate = $err_sum / $total_base;
174 print OUT "#total_base:\t$total_base\n";
175 print OUT "#avg_mis_rate:\t$avg_mis_rate\n";
176 print OUT "#avg_err_rate:\t$avg_err_rate\n";
177 print OUT "#cycle\tmis_rate\terr_rate\n";
178 for(my $cycle=0;$cycle<@sum_cycle;$cycle++)
180 my $mis_rate_tmp = 1 - $sum_cycle_match[$cycle] / $sum_cycle[$cycle];
181 my $err_rate_tmp = $sum_cycle_err[$cycle] / $sum_cycle[$cycle];
182 my $cycle_tmp = $cycle + 1;
183 print OUT "$cycle_tmp\t$mis_rate_tmp\t$err_rate_tmp\n";
185 close OUT;
187 open OUT,">$out\.qualVSmis\.stat" || die "$!";
188 print OUT "qual\tQ_err\tmis_rate\n";
189 for(my $qual=$min_qual;$qual<=$max_qual;$qual++)
191 my $mis_tmp = 0;
192 $mis_tmp = $mis_qual_sum[$qual] / ($match_qual_sum[$qual]+$mis_qual_sum[$qual]) if($match_qual_sum[$qual]+$mis_qual_sum[$qual]);
193 # my $qual_tmp = chr($qual + 64);
194 my $qual_tmp = $qual;
195 print OUT "$qual_tmp\t$q2e[$qual]\t$mis_tmp\n";
197 close OUT;
199 open OUT,">$out\.transform\.cycle\.stat" || die "$!";
200 open OUT2,">$out\.transform\.avg\.stat" || die "$!";
201 print OUT "#cycle\t";
202 print OUT2 "ref\\read\t";
203 my $flag = 0;
204 for(my $ref=0;$ref<@transform_avg;$ref++)
206 for(my $base=0;$base<@{$transform_avg[$ref]};$base++)
208 # next if($base == $ref);
209 print OUT "$bit2seq[$ref]\-\>$bit2seq[$base]\t";
210 if(!$flag)
212 print OUT2 "$bit2seq[$base]\t";
215 if(!$flag)
217 print OUT2 "\n";
218 $flag=1;
221 print OUT "\n#avg\t";
222 for(my $ref=0;$ref<@transform_avg;$ref++)
224 print OUT2 "$bit2seq[$ref]\t";
225 for(my $base=0;$base<@{$transform_avg[$ref]};$base++)
227 # next if($base == $ref);
228 my $transform_rate_tmp = 0;
229 # $transform_rate_tmp = $transform_avg[$ref][$base] / ($sum_ref[$ref] - $transform_avg[$ref][$ref]) if(($sum_ref[$ref] - $transform_avg[$ref][$ref]));
230 $transform_rate_tmp = $transform_avg[$ref][$base] / $sum_ref[$ref] if($sum_ref[$ref]);
231 print OUT "$transform_rate_tmp\t";
232 print OUT2 "$transform_rate_tmp\t";
234 print OUT2 "\n";
236 close OUT2;
237 print OUT "\n";
238 for(my $cycle=0;$cycle<@transform_cycle;$cycle++)
240 my $cycle_tmp = $cycle + 1;
241 print OUT "$cycle_tmp\t";
242 for(my $ref=0;$ref<@transform_avg;$ref++)
244 for(my $base=0;$base<@{$transform_avg[$ref]};$base++)
246 # next if($base == $ref);
247 my $transform_rate_tmp = 0;
248 # $transform_rate_tmp = $transform_cycle[$cycle][$ref][$base] / ($sum_row[$ref][$cycle] - $transform_cycle[$cycle][$ref][$ref]) if(($sum_row[$ref][$cycle] - $transform_cycle[$cycle][$ref][$ref]));
249 $transform_rate_tmp = $transform_cycle[$cycle][$ref][$base] / $sum_row[$ref][$cycle] if($sum_row[$ref][$cycle]);
250 print OUT "$transform_rate_tmp\t";
253 print OUT "\n";
255 close OUT;
257 open OUT1,">$out\.qual\.mat\.dis" || die "$!";
258 open OUT2,">$out\.qual\.mis\.dis" || die "$!";
259 print OUT1 "#qual\tavg\t";
260 print OUT2 "#qual\tavg\t";
261 for(my $cycle=0;$cycle<@match_qual_cycle;$cycle++)
263 my $cycle_tmp = $cycle + 1;
264 print OUT1 "cycl:$cycle_tmp\t";
265 print OUT2 "cycl:$cycle_tmp\t";
267 print OUT1 "\n";
268 print OUT2 "\n";
269 for(my $qual=$min_qual;$qual<=$max_qual;$qual++)
271 # my $qual_tmp = chr($qual+64);
272 my $match_tmp = $match_qual_sum[$qual] / $match_sum;
273 my $mis_tmp = $mis_qual_sum[$qual] / $mis_sum;
274 print OUT1 "$qual\t$match_tmp\t";
275 print OUT2 "$qual\t$mis_tmp\t";
276 for(my $cycle=0;$cycle<@match_qual_cycle;$cycle++)
278 $match_tmp = 0;
279 $match_tmp = $match_qual_cycle[$cycle][$qual] / $sum_cycle_match[$cycle] if($sum_cycle_match[$cycle]);
280 my $mis_tmp = 0;
281 $mis_tmp = $mis_qual_cycle[$cycle][$qual] / ($sum_cycle[$cycle] - $sum_cycle_match[$cycle]) if($sum_cycle[$cycle] - $sum_cycle_match[$cycle]);
282 print OUT1 "$match_tmp\t";
283 print OUT2 "$mis_tmp\t";
285 print OUT1 "\n";
286 print OUT2 "\n";
288 close OUT1;
289 close OUT2;