modified: pixi.toml
[GalaxyCodeBases.git] / released / pIRS.old / bwasam / matrixstat.pl
blobf5e5cf13db602d32d4b45f25a51db83adbae76f3
1 #!/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BGI <huxuesong@genomics.org.cn>
4 Version: 0.1.1 @ 20110819
5 =cut
6 #use lib "/ifs1/ST_ASMB/USER/huxuesong/public/lib";
7 #use lib '/export/data0/gentoo/tmp';
8 use strict;
9 use warnings;
10 use Time::HiRes qw ( gettimeofday tv_interval );
11 use Galaxy::ShowHelp;
13 $main::VERSION=0.1.1;
14 our $opts='r:o:l:p:s:c:b';
15 our($opt_o, $opt_r, $opt_l, $opt_p, $opt_s, $opt_c, $opt_b);
17 #our $desc='';
18 our $help=<<EOH;
19 \t-p type of input files {(auto),sam,soap}
20 \t-r ref fasta file (./ref/human.fa) [.{gz,bz2} is OK]
21 \t-s trim SNP positions from (<filename>) in format /^ChrID\\tPos/
22 \t-l read length of reads (100)
23 \t-o output prefix (./matrix).{count,ratio}.matrix
24 \t-c ChrID list (./chrtouse)
25 \t-b No pause for batch runs
26 For gzipped files, use zcat and pipe(|).
27 EOH
28 our $ARG_DESC='{sam,soap}pe_files';
30 ShowHelp();
31 $opt_r='./ref/human.fa' if ! $opt_r;
32 $opt_p='auto' if ! $opt_p;
33 $opt_o='./matrix' if ! $opt_o;
34 #$opt_c='./chrtouse' if ! $opt_c;
35 $opt_l=100 if ! $opt_l;
36 die "[x]-r $opt_r not exists !\n" unless -f $opt_r;
37 if ($opt_s) {die "[x]-s $opt_s not exists !\n" unless -f $opt_s;}
39 print STDERR "From [@ARGV]($opt_p) of [$opt_l] with [$opt_r][$opt_s] to [$opt_o]\n";
40 print STDERR "ChrID list:[$opt_c]\n" if $opt_c;
41 unless ($opt_b) {print STDERR "Wait 3 seconds to continue...\n"; sleep 3;}
43 my $start_time = [gettimeofday];
44 #BEGIN
46 my %Genome;
47 if ($opt_c) {
48 open C,'<',$opt_c or die "Error: $!\n";
49 while(<C>){
50 chomp;
51 ++$Genome{$_};
53 close C;
55 warn "[!]Reading Reference Genome:\n";
56 if ($opt_r =~ /.bz2$/) {
57 open( GENOME,"-|","bzip2 -dc $opt_r") or die "Error opening $opt_r: $!\n";
58 } elsif ($opt_r =~ /.gz$/) {
59 open( GENOME,"-|","gzip -dc $opt_r") or die "Error opening $opt_r: $!\n";
60 } else {open( GENOME,"<",$opt_r) or die "Error opening $opt_r: $!\n";}
61 #open GENOME,'<',$opt_r or die "Error: $!\n";
62 while (<GENOME>) {
63 s/^>//;
64 /^(\S+)/ or next;
65 my $seqname = $1;
66 print STDERR " >$seqname ...";
67 $/=">";
68 my $genome=<GENOME>;
69 chomp $genome;
70 $genome=~s/\s//g;
71 $/="\n";
72 if ((!$opt_c) or exists $Genome{$seqname}) {
73 $Genome{$seqname}=$genome;
74 print STDERR "\b\b\b",length $Genome{$seqname},".\n";
75 } else {print STDERR "\b\b\b",length $genome,", skipped.\n";}
76 $genome='';
78 close GENOME;
79 if ($opt_s) {
80 print STDERR "[!]Reading SNP: ";
81 open SNP,'<',$opt_s or die "Error: $!\n";
82 while(<SNP>) {
83 chomp;
84 my ($chr,$pos)=split /\s+/;
85 substr $Genome{$chr},$pos-1,1,'x' if exists $Genome{$chr};
87 close SNP;
88 print STDERR "done.\n";
90 ###
91 #print ">$_\n$Genome{$_}\n\n" for sort keys %Genome;
92 ###
93 sub getBases($$$) {
94 my ($chr,$start,$len)=@_;
95 return substr $Genome{$chr},$start-1,$len;
98 my $READLEN=$opt_l;
99 my $MaxQ=40;
100 my $MisBase=0;
102 my ($TotalBase,$TotalReads,%BaseCountTypeRef);
103 my ($mapBase,$mapReads,$QBbase,$QBmis)=(0,0,0,0);
104 my $Qascii=33; # Sam 33, Soap 64.
105 my %Stat; # $Stat{Ref}{Cycle}{Read}{Quality}
106 sub statRead($$$$$) {
107 my ($ref,$isReverse,$read,$Qstr,$cyclestart)=@_;
108 if ($isReverse) {
109 $ref =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
110 $read =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
112 my $PEpos=-1;
113 my $QBflag=0;
114 for (my $i=0;$i<$READLEN;$i++) {
115 my $refBase=substr $ref,$i,1 or return;
116 next unless $refBase =~ /^[ATCG]$/;
117 my $readBase=substr $read,$i,1;
118 next if $readBase eq 'N';
119 my $QstrSingle=substr $Qstr,$i,1;
120 my $Qval=ord($QstrSingle)-$Qascii;
121 if ($MaxQ<$Qval) {
122 $MaxQ=$Qval;
123 warn "[!] Qval=$Qval($QstrSingle) > 40 found. Remember to add -I at bwa aln for Illumina reads !\n";
125 if ($isReverse) {
126 $PEpos=$cyclestart+$READLEN-1-$i;
127 } else {
128 $PEpos=$cyclestart+$i;
130 ++$Stat{$refBase}{$PEpos}{$readBase}{$Qval};
131 if ($Qval <= 2) {
132 $QBflag = 1;
133 ++$QBbase;
135 if ($refBase ne $readBase) {
136 ++$MisBase;
137 $QBmis += $QBflag;
139 ++$BaseCountTypeRef{$refBase};
140 ++$TotalBase;
141 #print "$isReverse {$refBase}{$PEpos}{$readBase}{$Qval} ",($refBase eq $readBase)?'=':'x',"\n";
143 ++$TotalReads unless $PEpos==-1;
146 my $type;
147 if ($opt_p eq 'sam') {
148 $type = 'sam';
149 } elsif ($opt_p eq 'soap') {
150 $type = 'soap';
151 $Qascii = 64;
152 } else {
153 chomp($_=<>) or die "[x]Empty input !\n";
154 if (/^@/) {
155 $type = 'sam';
156 } else {
157 $type = 'soap';
158 $Qascii = 64;
159 my @read1=split /\t/;
160 chomp($_=<>) or die "[x]Input too short !\n";
161 my @read2=split /\t/;
162 die "[x]Not PE soap file.\n" if $read1[0] ne $read2[0];
163 $mapBase += $read1[5]+$read2[5];
164 $mapReads +=2;
165 goto LABEL unless exists $Genome{$read1[7]};
166 goto LABEL unless $read1[3] == 1 and $read2[3] == 1; # Hit==1
167 goto LABEL if $read1[9] > 100 or $read2[9] > 100; # No Indel
168 goto LABEL unless $read1[5] == $READLEN;
169 goto LABEL unless $read2[5] == $READLEN;
170 goto LABEL unless $read1[7] eq $read2[7]; # same Reference sequence NAME
171 my $ref1=uc getBases($read1[7],$read1[8],$READLEN) or print join("\t",@read1),"\n";
172 my $ref2=uc getBases($read2[7],$read2[8],$READLEN) or print join("\t",@read2),"\n";
173 #my ($QNAME,$Seq,$Qual,$Hit,$a/b,$Len,$Strand,$Chr,$Pos,$Type,$SMID,$CIAGR,$Etc)=@read1;
174 # 0 1 2 3 4 5 6 7 8 9 10 11 12
175 statRead($ref1,$read1[6] eq '-',$read1[1],$read1[2],1);
176 statRead($ref2,$read2[6] eq '-',$read2[1],$read2[2],1+$READLEN);
179 LABEL:
180 print "[!]Input file type is [$type].\n";
182 #my ($RL1,$RL2)=(0,0);
183 if ($type eq 'sam') {
184 while (<>) {
185 next if /^@\w\w\t\w\w:/;
186 chomp;
187 my @read1=split /\t/;
188 chomp($_=<>) or last;
189 my @read2=split /\t/;
190 #print join("\t",@read1),"\n-",join("\t",@read2),"\n";
191 die "[x]Not PE sam file.\n" if $read1[0] ne $read2[0];
192 ++$mapReads if $read1[2] ne '*';
193 ++$mapReads if $read2[2] ne '*';
194 next unless exists $Genome{$read1[2]};
195 next unless ($read1[1] & 3) == 3; # paired + mapped in a proper pair
196 next if $read1[1] >= 256; # not primary || QC failure || optical or PCR duplicate
197 next unless ($read2[1] & 3) == 3;
198 next if $read2[1] >= 256;
199 #$RL1=length($read1[9]);
200 #$RL2=length($read2[9]);
201 next unless $read1[5] =~ /^(\d+)M$/;
202 next unless $1 == $READLEN;
203 next unless $read2[5] =~ /^(\d+)M$/;
204 next unless $1 == $READLEN;
205 next unless $read1[6] eq '='; # same Reference sequence NAME
206 next unless $read2[6] eq '=';
207 next if $read1[11] eq 'XT:A:R'; # Type: Unique/Repeat/N/Mate-sw, N not found.
208 next if $read2[11] eq 'XT:A:R';
209 my $ref1=uc getBases($read1[2],$read1[3],$READLEN) or print join("\t",@read1),"\n";
210 my $ref2=uc getBases($read2[2],$read2[3],$READLEN) or print join("\t",@read2),"\n";
211 #my ($QNAME,$FLAG,$RNAME,$POS,$MAPQ,$CIAGR,$MRNM,$MPOS,$ISIZE,$SEQ,$QUAL,$OPT)=@read1;
212 # 0 1 2 3 4 5 6 7 8 9 10 11
213 statRead($ref1,$read1[1] & 16,$read1[9],$read1[10],1);
214 statRead($ref2,$read2[1] & 16,$read2[9],$read2[10],1+$READLEN);
216 } else {
217 while (<>) {
218 #next if /^@\w\w\t\w\w:/;
219 chomp;
220 my @read1=split /\t/;
221 chomp($_=<>) or last;
222 my @read2=split /\t/;
223 #print join("\t",@read1),"\n-",join("\t",@read2),"\n";
224 die "[x]Not PE soap file.\n" if $read1[0] ne $read2[0];
225 $mapBase += $read1[5]+$read2[5];
226 $mapReads +=2;
227 next unless exists $Genome{$read1[7]};
228 next unless $read1[3] == 1 and $read2[3] == 1; # Hit==1
229 next if $read1[9] > 100 or $read2[9] > 100; # No Indel
230 next unless $read1[5] == $READLEN;
231 next unless $read2[5] == $READLEN;
232 next unless $read1[7] eq $read2[7]; # same Reference sequence NAME
233 my $ref1=uc getBases($read1[7],$read1[8],$READLEN) or print join("\t",@read1),"\n";
234 my $ref2=uc getBases($read2[7],$read2[8],$READLEN) or print join("\t",@read2),"\n";
235 #my ($QNAME,$Seq,$Qual,$Hit,$a/b,$Len,$Strand,$Chr,$Pos,$Type,$SMID,$CIAGR,$Etc)=@read1;
236 # 0 1 2 3 4 5 6 7 8 9 10 11 12
237 statRead($ref1,$read1[6] eq '-',$read1[1],$read1[2],1);
238 statRead($ref2,$read2[6] eq '-',$read2[1],$read2[2],1+$READLEN);
242 open OA,'>',$opt_o.'.count.matrix' or die "Error: $!\n";
243 open OB,'>',$opt_o.'.ratio.matrix' or die "Error: $!\n";
244 my $tmp;
245 chomp(my $user=`id -nru`);
246 @ARGV=('/etc/passwd');
247 chomp(my @passwd=<>);
248 ($_)=grep /$user/,@passwd;
249 $tmp=(split /:/)[4];
250 my $mail='';
251 $mail=" <$tmp>" unless $tmp =~ /^\s*$/;
252 my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time);
253 my $date=sprintf "%02d:%02d:%02d,%4d-%02d-%02d",$hour,$min,$sec,$year+1900,$mon+1,$mday;
254 my $Cycle=2*$READLEN;
255 my $Qcount=$MaxQ+1;
256 $TotalBase=-1 unless $TotalBase;
257 my $MisRate=100*$MisBase/$TotalBase;
258 $tmp="#Generate @ $date by ${user}$mail
259 #Input [$type] file of mapped Reads: $mapReads , mapped Bases $mapBase (no base stat for sam files)
260 #Total statistical Bases: $TotalBase , Reads: $TotalReads of ReadLength $READLEN
261 #Dimensions: Ref_base_number 4, Cycle_number $Cycle, Seq_base_number 4, Quality_number $Qcount
262 #Mismatch_base: $MisBase, Mismatch_rate: $MisRate %
263 #QB_Bases: $QBbase, QB_Mismatches: $QBmis (bases with quality <= 2)
264 #Reference Base Ratio in reads: ";
265 my @BaseOrder=sort qw{A T C G}; # keys %BaseCountTypeRef;
266 for (@BaseOrder) {
267 $tmp .= $_.' '. int(0.5+100*1000*$BaseCountTypeRef{$_}/$TotalBase)/1000 .' %; ';
269 my @BaseQ;
270 for my $base (@BaseOrder) {
271 push @BaseQ,"$base-$_" for (0..$MaxQ);
273 $tmp .= "\n#".join("\t",'Ref','Cycle',@BaseQ);
274 print OA $tmp;
275 print OB $tmp;
276 print OA "\tRowSum";
277 print OB "\n";
278 my ($count,$countsum);
279 for my $ref (@BaseOrder) {
280 print OA "\n";
281 for my $cycle (1..(2*$READLEN)) {
282 $tmp="$ref\t$cycle\t";
283 print OA $tmp; print OB $tmp;
284 my (@Counts,@Rates)=();
285 for my $base (@BaseOrder) {
286 for my $q (0..$MaxQ) {
287 if (exists $Stat{$ref}{$cycle} and exists $Stat{$ref}{$cycle}{$base} and exists $Stat{$ref}{$cycle}{$base}{$q}) {
288 $count=$Stat{$ref}{$cycle}{$base}{$q};
289 } else {$count=0;}
290 push @Counts,$count;
293 $countsum=0;
294 $countsum += $_ for @Counts;
295 $countsum=-1 if $countsum==0;
296 push @Rates,$_/$countsum for @Counts;
297 print OA join("\t",@Counts,$countsum),"\n";
298 print OB join("\t",@Rates),"\n";
301 close OA;
302 close OB;
303 #END
304 my $stop_time = [gettimeofday];
306 print STDERR "\nTime Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s).\n";
307 __END__
308 zcat bwamask/mask110621_I263_FCB066DABXX_L8_HUMjrmRACDKAAPEI-3.sam.gz|head -n200|./matrixsam.pl -b 2>&1 |tee logerr.txt
310 3289m for Hg18 reference.
312 0.192234941 ms per line.
313 Thus, for a LANE of 216009064 lines, which is (108004507 sequences)x2+50, 11.5345804648626 hours needed.