new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / ifollowQ.pl
blob55bb67fac82bc6bb97bcf25384e80dab52d633dd
1 #!/bin/env perl
2 use strict;
3 use warnings;
5 die "Usage: $0 <LENtoStat> <read1.fq.gz> <outprefix>.fQ{out,dat}\n" if @ARGV != 3;
6 my ($LENtoStat,$fq1,$out)=@ARGV;
7 $out .= ".$LENtoStat";
9 sub openfile($) {
10 my ($filename)=@_;
11 my $infile;
12 if ($filename=~/.bz2$/) {
13 open( $infile,"-|","bzip2 -dc $filename") or die "Error opening $filename: $!\n";
14 } elsif ($filename=~/.gz$/) {
15 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
16 } else {open( $infile,"<",$filename) or die "Error opening $filename: $!\n";}
17 return $infile;
19 sub readfq($) {
20 my $fh=$_[0];
21 defined(my $a=<$fh>) or return [1];
22 chomp($a);
23 chomp(my $b=<$fh>) or return [2];
24 chomp(my $c=<$fh>) or return [3];
25 chomp(my $d=<$fh>) or return [4];
26 #print "$b\n$d\n";
27 $b=substr($b,0,length($d)) if $d =~ s/B+$//;
28 #print "$a\n$b\n$c\n$d\n---\n";
29 return [] unless length($d);
30 return [$a,$b,$c,$d];
32 sub getQ($) {
33 my @Qstr=split //,$_[0];
34 my @Qvalue=();
35 push @Qvalue,ord($_)-64 for @Qstr;
36 return \@Qvalue;
38 sub cal($) {
39 my $Qhashes=$_[0];
40 my ($x,$xx,$n,$cnt)=(0,0,0);
41 my ($max,$min,$common,$maxcnt)=(0,(keys %$Qhashes)[0],0,0);
42 for my $k (keys %$Qhashes) {
43 $cnt = $$Qhashes{$k};
44 $x += $k * $cnt;
45 $xx += $k*$k * $cnt;
46 $n += $cnt;
47 $max = $k if $max < $k;
48 $min = $k if $min > $k;
49 if ($maxcnt<$cnt) {
50 $maxcnt = $cnt;
51 $common = $k;
54 if ($n<2) {
55 return [$n,0,0] if $n<1;
56 return [$n,$max,0] if $n==1;
58 my $mean=$x/$n;
59 my $std=($xx-$x*$mean)/$n-1;
60 if ($std>0) {
61 $std=sqrt($std);
62 } elsif ($std==-1) {
63 $std='i';
64 } else {
65 $std="sqrt($std)";
67 return [$n,$max,$min,$common,$mean,$std];
70 my ($ReadCnt,%statQ,$ret)=(0);
72 #my $LENtoStat=10;
73 sub statQ($) {
74 my $Qvalues=$_[0];
75 my $Qlen=scalar @$Qvalues;
76 return "[x]Read Length must >= $LENtoStat." if $Qlen<$LENtoStat;
77 for my $p (0..$Qlen-$LENtoStat-1) {
78 # for my $p (19..$Qlen-$LENtoStat-1) {
79 for my $q ($p+1..$p+$LENtoStat) {
80 ++$statQ{$$Qvalues[$p]}{$$Qvalues[$q]};
84 my $FQa=openfile($fq1);
85 #my $FQb=openfile($fq2);
87 my $fqitem=&readfq($FQa);
88 while (@$fqitem != 1) {
89 if (@$fqitem == 0) {
90 $fqitem=&readfq($FQa);
91 next;
93 my ($id,$Q)=@$fqitem[0,3];
94 my $Qvalues=&getQ($Q);
95 &statQ($Qvalues);
96 $fqitem=&readfq($FQa);
97 ++$ReadCnt;
99 close $FQa;
101 open OUT,'>',$out.'.fQout' or die "Error opening $out.fQout: $!\n";
102 open OD,'>',$out.'.fQdat' or die "Error opening $out.fQdat: $!\n";
103 print OUT "#ReadsCnt=$ReadCnt LENtoStat=$LENtoStat\n#",join("\t",qw/ Q cnt max min common mean std /),"\n";
104 print OD "#ReadsCnt=$ReadCnt LENtoStat=$LENtoStat\n#Q\toutMean\t",join("\t",(2..40)),"\n";
105 my ($above,$below,$at)=(0,0,0);
106 for my $k (sort {$a<=>$b} keys %statQ) {
107 $ret=&cal($statQ{$k});
108 print OUT join("\t",$k,@$ret),"\n";
109 print OD "$k\t$$ret[-2]";
110 for my $oq (2..40) {
111 my $out='-';
112 if (exists $statQ{$k}{$oq}){# && $k>2 && $oq>2) {
113 $out=$statQ{$k}{$oq};
114 if ($oq>$k) {$above+=$out;}# if $k>2;}
115 elsif ($oq==$k) {$at+=$out;}
116 else {$below+=$out;}
118 print OD "\t$out";
120 print OD "\n";
122 #print OUT "Y\t$_\t$Yrange{$_}\n" for sort {$a<=>$b} keys %Yrange;
123 close OUT;
124 print OD "# Above: $above\n# At: $at\n# Below: $below\n";
125 close OD;