3 Author: Hu Xuesong @ BGI <huxuesong@genomics.org.cn>
4 Version: 0.1.1 @ 20110819
6 #use lib "/ifs1/ST_ASMB/USER/huxuesong/public/lib";
9 use Time
::HiRes qw
( gettimeofday tv_interval
);
18 \t-o output prefix (./allmatrix).{count,ratio}.matrix
19 \t-b No pause for batch runs
21 our $ARG_DESC='matrix_count_files';
24 $opt_o='./allmatrix' if ! $opt_o;
26 print STDERR
"From [@ARGV] to [$opt_o]\n";
27 unless ($opt_b) {print STDERR
'press [Enter] to continue...'; <STDIN
>;}
29 my $start_time = [gettimeofday
];
33 my ($TotalReads,$TotalBase,$MisBase,%BaseCountTypeRef)=(0,0,0);
34 my ($mapBase,$mapReads,$QBbase,$QBmis)=(0,0,0,0);
36 my %Stat; # $Stat{Ref}{Cycle}{Read-Quality}
39 if (/^#Input \[(\w+)\] file of mapped Reads: (\d+) , mapped Bases (\d+) \(no base stat for sam files\)$/) {
40 $type=$1 if $type ne 'sam';
42 $mapBase += $3 if $type ne 'sam';
44 if (/^#Total statistical Bases: (\d+) , Reads: (\d+) of ReadLength (\d+)$/) {
45 print " >$ARGV| Reads:$2 ReadLen:$3 Bases:$1\n";
47 $READLEN = $3 if $READLEN < $3;
49 if (/^#Ref\tCycle\t/) {
52 (undef,undef,@BQHeader)=split /\t/;
53 pop @BQHeader if $BQHeader[-1] eq 'RowSum';
55 if (/^#Dimensions:.+?Quality_number (\d+)$/) {
56 $Qcount = $1 if $Qcount<$1;
58 if (/^#Mismatch_base: (\d+)/) {
61 if (/^#QB_Bases: (\d+), QB_Mismatches: (\d+)/) {
68 my ($ref,$cycle,@BQ)=split /\t/;
69 #print "$ref,$cycle,@BQ\n";
70 next unless $ref =~ /^[ATCG]$/;
71 #die "[$_]\n$ref,$cycle,[@BQ]\n$#BQ < $#BQHeader " if $#BQ < $#BQHeader;
72 for my $key (@BQHeader) {
74 $Stat{$ref}{$cycle}{$key}+=$value;
75 $BaseCountTypeRef{$ref}+=$value;
77 #print "{$ref}{$cycle}{$key}$value\n";
80 #print $TotalReads,"\t",$READLEN,"\n";
81 #print join("\t",@BQHeader),"\n";
82 open OA
,'>',$opt_o.'.count.matrix' or die "Error: $!\n";
83 open OB
,'>',$opt_o.'.ratio.matrix' or die "Error: $!\n";
85 chomp(my $user=`id -nru`);
86 @ARGV=('/etc/passwd');
88 ($_)=grep /$user/,@passwd;
91 $mail=" <$tmp>" unless $tmp =~ /^\s*$/;
92 my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time);
93 my $date=sprintf "%02d:%02d:%02d,%4d-%02d-%02d",$hour,$min,$sec,$year+1900,$mon+1,$mday;
95 my $MisRate=100*$MisBase/$TotalBase;
96 $tmp="#Generate @ $date by ${user}$mail
97 #Input [$type] file of Reads: $mapReads , Bases $mapBase (no base stat for sam files)
98 #Total statistical Bases: $TotalBase , Reads: $TotalReads of ReadLength $READLEN
99 #Dimensions: Ref_base_number 4, Cycle_number $Cycle, Seq_base_number 4, Quality_number $Qcount
100 #Mismatch_base: $MisBase, Mismatch_rate: $MisRate %
101 #QB_Bases: $QBbase, QB_Mismatches: $QBmis (bases with quality <= 2)
102 #Reference Base Ratio in reads: ";
103 my @BaseOrder=sort keys %BaseCountTypeRef; # qw{A T C G};
105 $tmp .= $_.' '. int(0.5+100*1000*$BaseCountTypeRef{$_}/$TotalBase)/1000 .' %; ';
108 $tmp .= "\n#".join("\t",'Ref','Cycle',@BQHeader);
113 my ($count,$countsum);
114 for my $ref (@BaseOrder) {
116 for my $cycle (sort {$a<=>$b} keys %{$Stat{$ref}}) {
117 $tmp="$ref\t$cycle\t";
118 print OA
$tmp; print OB
$tmp;
119 my (@Counts,@Rates)=();
120 for my $bq (@BQHeader) {
121 push @Counts,$Stat{$ref}{$cycle}{$bq};
123 #print "[",join("|",@Counts),"\n";
125 $countsum += $_ for @Counts;
126 push @Rates,$_/$countsum for @Counts;
127 print OA
join("\t",@Counts,$countsum),"\n";
128 print OB
join("\t",@Rates),"\n";
135 my $stop_time = [gettimeofday
];
137 print STDERR
"\nTime Elapsed:\t",tv_interval
( $start_time, $stop_time )," second(s).\n";