released/pIRS.old/bwasam/matrixstat.pl

   1 #!/bin/env perl
   2 =pod
   3 Author: Hu Xuesong @ BGI <huxuesong@genomics.org.cn>
   4 Version: 0.1.1 @ 20110819
   5 =cut
   6 #use lib "/ifs1/ST_ASMB/USER/huxuesong/public/lib";
   7 #use lib '/export/data0/gentoo/tmp';
   8 use strict;
   9 use warnings;
  10 use Time::HiRes qw ( gettimeofday tv_interval );
  11 use Galaxy::ShowHelp;
  12
  13 $main::VERSION=0.1.1;
  14 our $opts='r:o:l:p:s:c:b';
  15 our($opt_o, $opt_r, $opt_l, $opt_p, $opt_s, $opt_c, $opt_b);
  16
  17 #our $desc='';
  18 our $help=<<EOH;
  19 \t-p type of input files {(auto),sam,soap}
  20 \t-r ref fasta file (./ref/human.fa) [.{gz,bz2} is OK]
  21 \t-s trim SNP positions from (<filename>) in format /^ChrID\\tPos/
  22 \t-l read length of reads (100)
  23 \t-o output prefix (./matrix).{count,ratio}.matrix
  24 \t-c ChrID list (./chrtouse)
  25 \t-b No pause for batch runs
  26 For gzipped files, use zcat and pipe(|).
  27 EOH
  28 our $ARG_DESC='{sam,soap}pe_files';
  29
  30 ShowHelp();
  31 $opt_r='./ref/human.fa' if ! $opt_r;
  32 $opt_p='auto' if ! $opt_p;
  33 $opt_o='./matrix' if ! $opt_o;
  34 #$opt_c='./chrtouse' if ! $opt_c;
  35 $opt_l=100 if ! $opt_l;
  36 die "[x]-r $opt_r not exists !\n" unless -f $opt_r;
  37 if ($opt_s) {die "[x]-s $opt_s not exists !\n" unless -f $opt_s;}
  38
  39 print STDERR "From [@ARGV]($opt_p) of [$opt_l] with [$opt_r][$opt_s] to [$opt_o]\n";
  40 print STDERR "ChrID list:[$opt_c]\n" if $opt_c;
  41 unless ($opt_b) {print STDERR "Wait 3 seconds to continue...\n"; sleep 3;}
  42
  43 my $start_time = [gettimeofday];
  44 #BEGIN
  45
  46 my %Genome;
  47 if ($opt_c) {
  48     open C,'<',$opt_c or die "Error: $!\n";
  49     while(<C>){
  50         chomp;
  51         ++$Genome{$_};
  52     }
  53     close C;
  54 }
  55 warn "[!]Reading Reference Genome:\n";
  56 if ($opt_r =~ /.bz2$/) {
  57     open( GENOME,"-|","bzip2 -dc $opt_r") or die "Error opening $opt_r: $!\n";
  58 } elsif ($opt_r =~ /.gz$/) {
  59         open( GENOME,"-|","gzip -dc $opt_r") or die "Error opening $opt_r: $!\n";
  60 } else {open( GENOME,"<",$opt_r) or die "Error opening $opt_r: $!\n";}
  61 #open GENOME,'<',$opt_r or die "Error: $!\n";
  62 while (<GENOME>) {
  63     s/^>//;
  64         /^(\S+)/ or next;
  65         my $seqname = $1;
  66     print STDERR " >$seqname ...";
  67         $/=">";
  68         my $genome=<GENOME>;
  69         chomp $genome;
  70         $genome=~s/\s//g;
  71         $/="\n";
  72         if ((!$opt_c) or exists $Genome{$seqname}) {
  73         $Genome{$seqname}=$genome;
  74         print STDERR "\b\b\b",length $Genome{$seqname},".\n";
  75     } else {print STDERR "\b\b\b",length $genome,", skipped.\n";}
  76         $genome='';
  77 }
  78 close GENOME;
  79 if ($opt_s) {
  80     print STDERR "[!]Reading SNP: ";
  81     open SNP,'<',$opt_s or die "Error: $!\n";
  82     while(<SNP>) {
  83         chomp;
  84         my ($chr,$pos)=split /\s+/;
  85         substr $Genome{$chr},$pos-1,1,'x' if exists $Genome{$chr};
  86     }
  87     close SNP;
  88     print STDERR "done.\n";
  89 }
  90 ###
  91 #print ">$_\n$Genome{$_}\n\n" for sort keys %Genome;
  92 ###
  93 sub getBases($$$) {
  94     my ($chr,$start,$len)=@_;
  95     return substr $Genome{$chr},$start-1,$len;
  96 }
  97
  98 my $READLEN=$opt_l;
  99 my $MaxQ=40;
 100 my $MisBase=0;
 101
 102 my ($TotalBase,$TotalReads,%BaseCountTypeRef);
 103 my ($mapBase,$mapReads,$QBbase,$QBmis)=(0,0,0,0);
 104 my $Qascii=33;  # Sam 33, Soap 64.
 105 my %Stat;   # $Stat{Ref}{Cycle}{Read}{Quality}
 106 sub statRead($$$$$) {
 107     my ($ref,$isReverse,$read,$Qstr,$cyclestart)=@_;
 108     if ($isReverse) {
 109         $ref =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
 110         $read =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
 111     }
 112     my $PEpos=-1;
 113     my $QBflag=0;
 114     for (my $i=0;$i<$READLEN;$i++) {
 115         my $refBase=substr $ref,$i,1 or return;
 116         next unless $refBase =~ /^[ATCG]$/;
 117         my $readBase=substr $read,$i,1;
 118         next if $readBase eq 'N';
 119         my $QstrSingle=substr $Qstr,$i,1;
 120         my $Qval=ord($QstrSingle)-$Qascii;
 121         if ($MaxQ<$Qval) {
 122             $MaxQ=$Qval;
 123             warn "[!] Qval=$Qval($QstrSingle) > 40 found. Remember to add -I at bwa aln for Illumina reads !\n";
 124         }
 125         if ($isReverse) {
 126             $PEpos=$cyclestart+$READLEN-1-$i;
 127         } else {
 128             $PEpos=$cyclestart+$i;
 129         }
 130         ++$Stat{$refBase}{$PEpos}{$readBase}{$Qval};
 131         if ($Qval <= 2) {
 132             $QBflag = 1;
 133             ++$QBbase;
 134         }
 135         if ($refBase ne $readBase) {
 136             ++$MisBase;
 137             $QBmis += $QBflag;
 138         }
 139         ++$BaseCountTypeRef{$refBase};
 140         ++$TotalBase;
 141 #print "$isReverse {$refBase}{$PEpos}{$readBase}{$Qval} ",($refBase eq $readBase)?'=':'x',"\n";
 142     }
 143     ++$TotalReads unless $PEpos==-1;
 144 }
 145
 146 my $type;
 147 if ($opt_p eq 'sam') {
 148     $type = 'sam';
 149 } elsif ($opt_p eq 'soap') {
 150     $type = 'soap';
 151     $Qascii = 64;
 152 } else {
 153     chomp($_=<>) or die "[x]Empty input !\n";
 154     if (/^@/) {
 155         $type = 'sam';
 156     } else {
 157         $type = 'soap';
 158         $Qascii = 64;
 159         my @read1=split /\t/;
 160         chomp($_=<>) or die "[x]Input too short !\n";
 161         my @read2=split /\t/;
 162         die "[x]Not PE soap file.\n" if $read1[0] ne $read2[0];
 163         $mapBase += $read1[5]+$read2[5];
 164         $mapReads +=2;
 165         goto LABEL unless exists $Genome{$read1[7]};
 166         goto LABEL unless $read1[3] == 1 and $read2[3] == 1;  # Hit==1
 167         goto LABEL if $read1[9] > 100 or $read2[9] > 100; # No Indel
 168         goto LABEL unless $read1[5] == $READLEN;
 169         goto LABEL unless $read2[5] == $READLEN;
 170         goto LABEL unless $read1[7] eq $read2[7];   # same Reference sequence NAME
 171         my $ref1=uc getBases($read1[7],$read1[8],$READLEN) or print join("\t",@read1),"\n";
 172         my $ref2=uc getBases($read2[7],$read2[8],$READLEN) or print join("\t",@read2),"\n";
 173         #my ($QNAME,$Seq,$Qual,$Hit,$a/b,$Len,$Strand,$Chr,$Pos,$Type,$SMID,$CIAGR,$Etc)=@read1;
 174         #       0     1    2     3    4    5     6      7    8    9     10     11   12
 175         statRead($ref1,$read1[6] eq '-',$read1[1],$read1[2],1);
 176         statRead($ref2,$read2[6] eq '-',$read2[1],$read2[2],1+$READLEN);
 177     }
 178 }
 179 LABEL:
 180 print "[!]Input file type is [$type].\n";
 181
 182 #my ($RL1,$RL2)=(0,0);
 183 if ($type eq 'sam') {
 184     while (<>) {
 185         next if /^@\w\w\t\w\w:/;
 186         chomp;
 187         my @read1=split /\t/;
 188         chomp($_=<>) or last;
 189         my @read2=split /\t/;
 190     #print join("\t",@read1),"\n-",join("\t",@read2),"\n";
 191         die "[x]Not PE sam file.\n" if $read1[0] ne $read2[0];
 192         ++$mapReads if $read1[2] ne '*';
 193         ++$mapReads if $read2[2] ne '*';
 194         next unless exists $Genome{$read1[2]};
 195         next unless ($read1[1] & 3) == 3;  # paired + mapped in a proper pair
 196         next if $read1[1] >= 256;   # not primary || QC failure || optical or PCR duplicate
 197         next unless ($read2[1] & 3) == 3;
 198         next if $read2[1] >= 256;
 199         #$RL1=length($read1[9]);
 200         #$RL2=length($read2[9]);
 201         next unless $read1[5] =~ /^(\d+)M$/;
 202         next unless $1 == $READLEN;
 203         next unless $read2[5] =~ /^(\d+)M$/;
 204         next unless $1 == $READLEN;
 205         next unless $read1[6] eq '=';   # same Reference sequence NAME
 206         next unless $read2[6] eq '=';
 207         next if $read1[11] eq 'XT:A:R'; # Type: Unique/Repeat/N/Mate-sw, N not found.
 208         next if $read2[11] eq 'XT:A:R';
 209         my $ref1=uc getBases($read1[2],$read1[3],$READLEN) or print join("\t",@read1),"\n";
 210         my $ref2=uc getBases($read2[2],$read2[3],$READLEN) or print join("\t",@read2),"\n";
 211         #my ($QNAME,$FLAG,$RNAME,$POS,$MAPQ,$CIAGR,$MRNM,$MPOS,$ISIZE,$SEQ,$QUAL,$OPT)=@read1;
 212         #       0      1    2       3   4       5   6       7     8     9    10    11
 213         statRead($ref1,$read1[1] & 16,$read1[9],$read1[10],1);
 214         statRead($ref2,$read2[1] & 16,$read2[9],$read2[10],1+$READLEN);
 215     }
 216 } else {
 217     while (<>) {
 218         #next if /^@\w\w\t\w\w:/;
 219         chomp;
 220         my @read1=split /\t/;
 221         chomp($_=<>) or last;
 222         my @read2=split /\t/;
 223     #print join("\t",@read1),"\n-",join("\t",@read2),"\n";
 224         die "[x]Not PE soap file.\n" if $read1[0] ne $read2[0];
 225         $mapBase += $read1[5]+$read2[5];
 226         $mapReads +=2;
 227         next unless exists $Genome{$read1[7]};
 228         next unless $read1[3] == 1 and $read2[3] == 1;  # Hit==1
 229         next if $read1[9] > 100 or $read2[9] > 100; # No Indel
 230         next unless $read1[5] == $READLEN;
 231         next unless $read2[5] == $READLEN;
 232         next unless $read1[7] eq $read2[7];   # same Reference sequence NAME
 233         my $ref1=uc getBases($read1[7],$read1[8],$READLEN) or print join("\t",@read1),"\n";
 234         my $ref2=uc getBases($read2[7],$read2[8],$READLEN) or print join("\t",@read2),"\n";
 235         #my ($QNAME,$Seq,$Qual,$Hit,$a/b,$Len,$Strand,$Chr,$Pos,$Type,$SMID,$CIAGR,$Etc)=@read1;
 236         #       0     1    2     3    4    5     6      7    8    9     10     11   12
 237         statRead($ref1,$read1[6] eq '-',$read1[1],$read1[2],1);
 238         statRead($ref2,$read2[6] eq '-',$read2[1],$read2[2],1+$READLEN);
 239     }
 240 }
 241
 242 open OA,'>',$opt_o.'.count.matrix' or die "Error: $!\n";
 243 open OB,'>',$opt_o.'.ratio.matrix' or die "Error: $!\n";
 244 my $tmp;
 245 chomp(my $user=`id -nru`);
 246 @ARGV=('/etc/passwd');
 247 chomp(my @passwd=<>);
 248 ($_)=grep /$user/,@passwd;
 249 $tmp=(split /:/)[4];
 250 my $mail='';
 251 $mail=" <$tmp>" unless $tmp =~ /^\s*$/;
 252 my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time);
 253 my $date=sprintf "%02d:%02d:%02d,%4d-%02d-%02d",$hour,$min,$sec,$year+1900,$mon+1,$mday;
 254 my $Cycle=2*$READLEN;
 255 my $Qcount=$MaxQ+1;
 256 $TotalBase=-1 unless $TotalBase;
 257 my $MisRate=100*$MisBase/$TotalBase;
 258 $tmp="#Generate @ $date by ${user}$mail
 259 #Input [$type] file of mapped Reads: $mapReads , mapped Bases $mapBase (no base stat for sam files)
 260 #Total statistical Bases: $TotalBase , Reads: $TotalReads of ReadLength $READLEN
 261 #Dimensions: Ref_base_number 4, Cycle_number $Cycle, Seq_base_number 4, Quality_number $Qcount
 262 #Mismatch_base: $MisBase, Mismatch_rate: $MisRate %
 263 #QB_Bases: $QBbase, QB_Mismatches: $QBmis (bases with quality <= 2)
 264 #Reference Base Ratio in reads: ";
 265 my @BaseOrder=sort qw{A T C G}; # keys %BaseCountTypeRef;
 266 for (@BaseOrder) {
 267     $tmp .= $_.' '. int(0.5+100*1000*$BaseCountTypeRef{$_}/$TotalBase)/1000 .' %;   ';
 268 }
 269 my @BaseQ;
 270 for my $base (@BaseOrder) {
 271     push @BaseQ,"$base-$_" for (0..$MaxQ);
 272 }
 273 $tmp .= "\n#".join("\t",'Ref','Cycle',@BaseQ);
 274 print OA $tmp;
 275 print OB $tmp;
 276 print OA "\tRowSum";
 277 print OB "\n";
 278 my ($count,$countsum);
 279 for my $ref (@BaseOrder) {
 280     print OA "\n";
 281     for my $cycle (1..(2*$READLEN)) {
 282         $tmp="$ref\t$cycle\t";
 283         print OA $tmp; print OB $tmp;
 284         my (@Counts,@Rates)=();
 285         for my $base (@BaseOrder) {
 286             for my $q (0..$MaxQ) {
 287                 if (exists $Stat{$ref}{$cycle} and exists $Stat{$ref}{$cycle}{$base} and exists $Stat{$ref}{$cycle}{$base}{$q}) {
 288                     $count=$Stat{$ref}{$cycle}{$base}{$q};
 289                 } else {$count=0;}
 290                 push @Counts,$count;
 291             }
 292         }
 293         $countsum=0;
 294         $countsum += $_ for @Counts;
 295                 $countsum=-1 if $countsum==0;
 296         push @Rates,$_/$countsum for @Counts;
 297         print OA join("\t",@Counts,$countsum),"\n";
 298         print OB join("\t",@Rates),"\n";
 299     }
 300 }
 301 close OA;
 302 close OB;
 303 #END
 304 my $stop_time = [gettimeofday];
 305
 306 print STDERR "\nTime Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s).\n";
 307 __END__
 308 zcat bwamask/mask110621_I263_FCB066DABXX_L8_HUMjrmRACDKAAPEI-3.sam.gz|head -n200|./matrixsam.pl -b 2>&1 |tee logerr.txt
 309
 310 3289m for Hg18 reference.
 311
 312 0.192234941 ms per line.
 313 Thus, for a LANE of 216009064 lines, which is (108004507 sequences)x2+50, 11.5345804648626 hours needed.
 314