new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / cmpBfq.pl
blob97f45442b9ff3b0ab4b8a07f784be56f1e24c658
1 #!/bin/env perl
2 use strict;
3 use warnings;
4 use Galaxy::IO::FASTAQ;
5 use Galaxy::IO;
6 use Galaxy::Casava::Eamss qw(doEamss);
8 $SIG{INT} = \&stopnow;
10 die "Usage: $0 <ori-fq> <B-masked-fq> [outprefix].(log|stat)\n" if @ARGV <2;
11 my ($inA,$inB,$out)=@ARGV;
12 $out=$inA unless $out;
13 warn "From [$inA]&[$inB] to [$out].(log|stat)\n";
15 open LOG,'>',"${out}.log" or die "Error opening $out.log:$!\n";
16 open STAT,'>',"${out}.stat" or die "Error opening $out.stat:$!\n";
18 my $fha=openfile($inA);
19 my $fhb=openfile($inB);
21 #my @aux1 = undef;
22 #my @aux2 = undef;
23 my (@aux1,@aux2);
24 my ($Count1,$Count2,$CountPairs)=(0,0,0);
26 sub doCMP($$$);
28 my ($dat1,$dat2);
29 my ($maskedQ,$mlen,$rlen,$t,$ret,%Count,%MaskLen,%Readlen,%ReadType);
30 my ($totBase,$totMaskedBP,$totMaskedReads)=(0,0,0);
31 my ($totBaseC,$totMaskedBPC,$totMaskedReadsC)=(0,0,0);
32 while (1) {
33 $dat1 = readfq($fha, \@aux1); # [$name, $comment, $seq, $qual]
34 $dat2 = readfq($fhb, \@aux2);
35 if ($dat1 && $dat2) {
36 ($maskedQ,$mlen) = @{doEamss($$dat1[2],$$dat1[3])};
37 if ($$dat2[1] =~ /(\d):([YN]):\d+:([ATCGN]+)?/) { # 1:N:0:TGACCA
38 $t = ($1 eq '2')? 1 : 0;
39 $t |= 2 if ($2 eq 'Y');
40 $t |= 4 if $3;
41 $t |= 8 if $3 =~ /N/;
42 #++$ReadType{$t}; # 1:Read2, 2:Casava-filtered, 3:with-index
44 #print "3-",join('|',$$dat1[0],$$dat1[1],$mlen,$t),"\n$$dat1[2]\n$$dat1[3]\n$maskedQ\n";
45 RECMP:
46 $ret = doCMP($dat1,$dat2,$maskedQ);
47 if ($ret & 1) {
48 ++$ReadType{$t|16};
49 $dat2 = readfq($fhb, \@aux2);
50 goto RECMP;
52 # due to a mistake upstream in the masked FASTQ file ...
53 if ( ($ret&3) == 2 ) {
54 print STAT "1-",join('|',@$dat1),"\n2-",join('|',@$dat2),"\n";
56 ++$ReadType{$t};
57 ++$Count{$ret};
58 ++$MaskLen{$mlen}->[0];
59 $rlen = length $$dat1[2];
60 ++$Readlen{$rlen};
61 ++$CountPairs;
62 $totBase += $rlen;
63 if ($mlen) {
64 ++$totMaskedReads;
65 $totMaskedBP += $mlen;
66 unless ($t & 2) {
67 $totMaskedBPC += $mlen;
68 ++$totMaskedReadsC;
71 unless ($t & 2) {
72 $totBaseC += $rlen;
73 ++$MaskLen{$mlen}->[1];
75 } elsif ($dat1) {
76 ++$Count1;
77 print "1-",join('|',@$dat1),"\n";
78 } elsif ($dat2) {
79 ++$Count2;
80 print "2-",join('|',@$dat2),"\n";
81 } else {
82 last;
85 close $fha;
86 close $fhb;
88 my $str = "Out Pairs: $CountPairs\nFQ1 over hang: $Count1\nFQ2 over hang: $Count2\n";
89 print $str;
90 print LOG "From [$inA]&[$inB] to [$out.stat]\n$str";
92 $totMaskedReads = -1 unless $totMaskedReads;
93 $t = int(0.5+10*$totMaskedBP/$totMaskedReads)/10;
94 $str = "\nTotal Reads, Base: $CountPairs, $totBase\tMasked Reads, Base: $totMaskedReads, $totMaskedBP\tAvg Masked Length: $t\n";
95 $totMaskedReadsC = -1 unless $totMaskedReadsC;
96 $t = int(0.5+10*$totMaskedBPC/$totMaskedReadsC)/10;
97 $str .= "After Casava-filter:\nTotal Base: $totBaseC\tMasked Reads, Base: $totMaskedReadsC, $totMaskedBPC\tAvg Masked Length: $t\n\n";
98 print $str;
99 print LOG $str,"@ ",join("\t",$totBaseC,$totMaskedBPC,$totBase,$totMaskedBP,$inA),"\n";
101 $str="[Compared] (0 or 8 is OK, diff for id,comment,seq,q,eamss)\n";
102 for my $k (sort {$a <=> $b} keys %Count) {
103 $str .= sprintf("%#06b\t(%#x)\t%d\n",$k,$k,$Count{$k});
105 print $str;
106 print LOG $str;
108 $str="\n[Read_Type] (1:Read2, 2:Casava-filtered, 3:with-index, 4:index-have-N, 5:masked-with-more-reads)\n";
109 for my $k (sort {$a <=> $b} keys %ReadType) {
110 $str .= sprintf("%#06b\t(%#x)\t%d\n",$k,$k,$ReadType{$k});
112 print $str;
113 print LOG $str;
115 $str="\n[Read_Length]\n";
116 for my $k (sort {$a <=> $b} keys %Readlen) {
117 $str .= sprintf("%d\t%d\n",$k,$Readlen{$k});
119 #print $str;
120 print LOG $str;
122 $str="\n[Masked_Length] (all, After_Casava-filter)\n";
123 for my $k (sort {$a <=> $b} keys %MaskLen) {
124 $MaskLen{$k}->[0] = 0 unless $MaskLen{$k}->[0];
125 $MaskLen{$k}->[1] = 0 unless $MaskLen{$k}->[1];
126 $str .= sprintf("%d\t%d\t%d\n",$k,$MaskLen{$k}->[0],$MaskLen{$k}->[1]);
128 #print $str;
129 print STAT $str;
131 close LOG;
132 close STAT;
135 sub doCMP($$$) {
136 my ($dat1,$dat2,$maskedQ) = @_;
137 my $flag = 0;
138 for my $i (0 .. 3) {
139 $flag |= 1<<$i if $$dat1[$i] ne $$dat2[$i];
141 $flag |= 1<<4 if $maskedQ ne $$dat2[3];
142 return $flag; # id,comment,seq,q,eamss
145 sub stopnow() {
146 $SIG{INT} = \&stopnow;
147 warn "\nCtrl+C detected, stop and saving ...\n";
148 print LOG "Ctrl+C detected. Stopped in mid-way.\n";
149 $dat1 = $dat2 = undef;