5 error_matrix_analyzer.pl
9 analyse the matrix about reads' characteristic count from comparison results
11 the program need the matrix file as input.It will creat 6 files:
12 *.misVSerr.base.stat : mismatch rate and error rate calculate by quality value for every cycle
13 *.qualVSmis.stat : compair the real mismatch rate and the calculated error rate for each quality value
14 *.transform.cycle.stat : the rate of the reference nucleotide be sequenced to each for nucleotide for every cycle
15 *.transform.avg.stat : the average the reference nucleotide be sequenced to each for nucleotide for all cycle along read
16 *.qual.mat.dis : the distribution of quality value for matched read base
17 *.qual.mis.dis : the distribution of quality value for mismatched read base
21 Author: Shi Yujian , shiyujian@genomics.org.cn
22 Version: 1.1 , Date:2011-8
26 perl error_matrix_analyzer.pl [option]
28 -o <str> output prefix
29 -m <num> min quality score[default:0]
30 -x <num> max quality score[default:40]
31 -B ignore the bases that quality is B or #
39 ##get options from command line into variables and set default values
40 my ($len, $in_file, $out, $min_qual, $max_qual, $help, $maskB);
50 die `pod2text $0` if($help || !defined $in_file);
53 if($in_file=~/\/([^\
/]+)$/)
62 my %seq2bit = ("A"=>0,"C"=>1,"G"=>2,"T"=>3);
63 my @bit2seq = ("A","C","G","T");
65 open IN
,$in_file || die"$!";
73 for(my $i=0;$i<4;$i++)
75 for(my $j=$min_qual;$j<=$max_qual;$j++)
77 $matrix[$seq2bit{$line[0]}][$line[1]-1][$i][$j] = $line[$k];
82 $matrix[$seq2bit{$line[0]}][$line[1]-1][$i][2] = 0;
89 for(my $i=$min_qual;$i<=$max_qual;$i++)
91 $q2e[$i] = 10 ** (-$i / 10);
94 my(@sum_row,@sum_cycle,@sum_ref,@sum_cycle_match,@sum_cycle_err,@transform_cycle,@transform_avg);
95 my(@match_qual_cycle,@mis_qual_cycle);
96 my(@match_qual_sum,$match_sum,@mis_qual_sum,$mis_sum,$err_sum);
97 for(my $ref=0;$ref<@matrix;$ref++)
99 for(my $cycle=0;$cycle<@
{$matrix[$ref]};$cycle++)
101 for(my $base=0;$base<@
{$matrix[$ref][$cycle]};$base++)
104 for(my $qual=$min_qual;$qual<=$max_qual;$qual++)
106 $sum_tmp += $matrix[$ref][$cycle][$base][$qual];
107 my $err_tmp = $matrix[$ref][$cycle][$base][$qual] * $q2e[$qual];
108 $sum_cycle_err[$cycle] += $err_tmp;
109 $err_sum += $err_tmp;
112 $sum_cycle_match[$cycle] += $matrix[$ref][$cycle][$base][$qual];
113 $match_qual_cycle[$cycle][$qual] += $matrix[$ref][$cycle][$base][$qual];
114 $match_qual_sum[$qual] += $matrix[$ref][$cycle][$base][$qual];
115 $match_sum += $matrix[$ref][$cycle][$base][$qual];
119 $mis_qual_cycle[$cycle][$qual] += $matrix[$ref][$cycle][$base][$qual];
120 $mis_qual_sum[$qual] += $matrix[$ref][$cycle][$base][$qual];
121 $mis_sum += $matrix[$ref][$cycle][$base][$qual];
124 $sum_row[$ref][$cycle] += $sum_tmp;
125 $transform_cycle[$cycle][$ref][$base] += $sum_tmp;
126 $transform_avg[$ref][$base] += $sum_tmp;
128 $sum_cycle[$cycle] += $sum_row[$ref][$cycle];
129 $sum_ref[$ref] += $sum_row[$ref][$cycle];
133 #open OUT,">$out\.ratio" || die "$!";
134 my $total_base = $match_sum + $mis_sum;
135 #print OUT "#total_base_number:\t$total_base\n";
136 #print OUT "#base_rario:\t";
137 #for(my $ref=0;$ref<@matrix;$ref++)
139 # my $sum_ref_rate = $sum_ref[$ref] / $total_base;
140 # print OUT "$bit2seq[$ref]:$sum_ref_rate\t";
142 #print OUT "\n#ref_base\tcycle\t";
143 #for(my $base=0;$base<@{$matrix[0][0]};$base++)
145 # for(my $qual=$min_qual;$qual<=$max_qual;$qual++)
147 # print OUT "$bit2seq[$base]$qual\t";
151 #for(my $ref=0;$ref<@matrix;$ref++)
153 # for(my $cycle=0;$cycle<@{$matrix[$ref]};$cycle++)
155 # my $cycle_tmp = $cycle + 1;
156 # print OUT "$bit2seq[$ref]\t$cycle_tmp\t";
157 # for(my $base=0;$base<@{$matrix[$ref][$cycle]};$base++)
159 # for(my $qual=$min_qual;$qual<=$max_qual;$qual++)
162 # $ratio = $matrix[$ref][$cycle][$base][$qual] / $sum_row[$ref][$cycle] if($sum_row[$ref][$cycle]);
163 # print OUT "$ratio\t";
171 open OUT
,">$out\.misVSerr\.base\.stat" || die "$!";
172 my $avg_mis_rate = $mis_sum / $total_base;
173 my $avg_err_rate = $err_sum / $total_base;
174 print OUT
"#total_base:\t$total_base\n";
175 print OUT
"#avg_mis_rate:\t$avg_mis_rate\n";
176 print OUT
"#avg_err_rate:\t$avg_err_rate\n";
177 print OUT
"#cycle\tmis_rate\terr_rate\n";
178 for(my $cycle=0;$cycle<@sum_cycle;$cycle++)
180 my $mis_rate_tmp = 1 - $sum_cycle_match[$cycle] / $sum_cycle[$cycle];
181 my $err_rate_tmp = $sum_cycle_err[$cycle] / $sum_cycle[$cycle];
182 my $cycle_tmp = $cycle + 1;
183 print OUT
"$cycle_tmp\t$mis_rate_tmp\t$err_rate_tmp\n";
187 open OUT
,">$out\.qualVSmis\.stat" || die "$!";
188 print OUT
"qual\tQ_err\tmis_rate\n";
189 for(my $qual=$min_qual;$qual<=$max_qual;$qual++)
192 $mis_tmp = $mis_qual_sum[$qual] / ($match_qual_sum[$qual]+$mis_qual_sum[$qual]) if($match_qual_sum[$qual]+$mis_qual_sum[$qual]);
193 # my $qual_tmp = chr($qual + 64);
194 my $qual_tmp = $qual;
195 print OUT
"$qual_tmp\t$q2e[$qual]\t$mis_tmp\n";
199 open OUT
,">$out\.transform\.cycle\.stat" || die "$!";
200 open OUT2
,">$out\.transform\.avg\.stat" || die "$!";
201 print OUT
"#cycle\t";
202 print OUT2
"ref\\read\t";
204 for(my $ref=0;$ref<@transform_avg;$ref++)
206 for(my $base=0;$base<@
{$transform_avg[$ref]};$base++)
208 # next if($base == $ref);
209 print OUT
"$bit2seq[$ref]\-\>$bit2seq[$base]\t";
212 print OUT2
"$bit2seq[$base]\t";
221 print OUT
"\n#avg\t";
222 for(my $ref=0;$ref<@transform_avg;$ref++)
224 print OUT2
"$bit2seq[$ref]\t";
225 for(my $base=0;$base<@
{$transform_avg[$ref]};$base++)
227 # next if($base == $ref);
228 my $transform_rate_tmp = 0;
229 # $transform_rate_tmp = $transform_avg[$ref][$base] / ($sum_ref[$ref] - $transform_avg[$ref][$ref]) if(($sum_ref[$ref] - $transform_avg[$ref][$ref]));
230 $transform_rate_tmp = $transform_avg[$ref][$base] / $sum_ref[$ref] if($sum_ref[$ref]);
231 print OUT
"$transform_rate_tmp\t";
232 print OUT2
"$transform_rate_tmp\t";
238 for(my $cycle=0;$cycle<@transform_cycle;$cycle++)
240 my $cycle_tmp = $cycle + 1;
241 print OUT
"$cycle_tmp\t";
242 for(my $ref=0;$ref<@transform_avg;$ref++)
244 for(my $base=0;$base<@
{$transform_avg[$ref]};$base++)
246 # next if($base == $ref);
247 my $transform_rate_tmp = 0;
248 # $transform_rate_tmp = $transform_cycle[$cycle][$ref][$base] / ($sum_row[$ref][$cycle] - $transform_cycle[$cycle][$ref][$ref]) if(($sum_row[$ref][$cycle] - $transform_cycle[$cycle][$ref][$ref]));
249 $transform_rate_tmp = $transform_cycle[$cycle][$ref][$base] / $sum_row[$ref][$cycle] if($sum_row[$ref][$cycle]);
250 print OUT
"$transform_rate_tmp\t";
257 open OUT1
,">$out\.qual\.mat\.dis" || die "$!";
258 open OUT2
,">$out\.qual\.mis\.dis" || die "$!";
259 print OUT1
"#qual\tavg\t";
260 print OUT2
"#qual\tavg\t";
261 for(my $cycle=0;$cycle<@match_qual_cycle;$cycle++)
263 my $cycle_tmp = $cycle + 1;
264 print OUT1
"cycl:$cycle_tmp\t";
265 print OUT2
"cycl:$cycle_tmp\t";
269 for(my $qual=$min_qual;$qual<=$max_qual;$qual++)
271 # my $qual_tmp = chr($qual+64);
272 my $match_tmp = $match_qual_sum[$qual] / $match_sum;
273 my $mis_tmp = $mis_qual_sum[$qual] / $mis_sum;
274 print OUT1
"$qual\t$match_tmp\t";
275 print OUT2
"$qual\t$mis_tmp\t";
276 for(my $cycle=0;$cycle<@match_qual_cycle;$cycle++)
279 $match_tmp = $match_qual_cycle[$cycle][$qual] / $sum_cycle_match[$cycle] if($sum_cycle_match[$cycle]);
281 $mis_tmp = $mis_qual_cycle[$cycle][$qual] / ($sum_cycle[$cycle] - $sum_cycle_match[$cycle]) if($sum_cycle[$cycle] - $sum_cycle_match[$cycle]);
282 print OUT1
"$match_tmp\t";
283 print OUT2
"$mis_tmp\t";