4 use Galaxy
::IO
::FASTAQ
;
6 use Galaxy
::Casava
::Eamss
qw(doEamss);
10 die "Usage: $0 <ori-fq> <B-masked-fq> [outprefix].(log|stat)\n" if @ARGV <2;
11 my ($inA,$inB,$out)=@ARGV;
12 $out=$inA unless $out;
13 warn "From [$inA]&[$inB] to [$out].(log|stat)\n";
15 open LOG
,'>',"${out}.log" or die "Error opening $out.log:$!\n";
16 open STAT
,'>',"${out}.stat" or die "Error opening $out.stat:$!\n";
18 my $fha=openfile
($inA);
19 my $fhb=openfile
($inB);
24 my ($Count1,$Count2,$CountPairs)=(0,0,0);
29 my ($maskedQ,$mlen,$rlen,$t,$ret,%Count,%MaskLen,%Readlen,%ReadType);
30 my ($totBase,$totMaskedBP,$totMaskedReads)=(0,0,0);
31 my ($totBaseC,$totMaskedBPC,$totMaskedReadsC)=(0,0,0);
33 $dat1 = readfq
($fha, \
@aux1); # [$name, $comment, $seq, $qual]
34 $dat2 = readfq
($fhb, \
@aux2);
36 ($maskedQ,$mlen) = @
{doEamss
($$dat1[2],$$dat1[3])};
37 if ($$dat2[1] =~ /(\d):([YN]):\d+:([ATCGN]+)?/) { # 1:N:0:TGACCA
38 $t = ($1 eq '2')?
1 : 0;
39 $t |= 2 if ($2 eq 'Y');
42 #++$ReadType{$t}; # 1:Read2, 2:Casava-filtered, 3:with-index
44 #print "3-",join('|',$$dat1[0],$$dat1[1],$mlen,$t),"\n$$dat1[2]\n$$dat1[3]\n$maskedQ\n";
46 $ret = doCMP
($dat1,$dat2,$maskedQ);
49 $dat2 = readfq
($fhb, \
@aux2);
52 # due to a mistake upstream in the masked FASTQ file ...
53 if ( ($ret&3) == 2 ) {
54 print STAT
"1-",join('|',@
$dat1),"\n2-",join('|',@
$dat2),"\n";
58 ++$MaskLen{$mlen}->[0];
59 $rlen = length $$dat1[2];
65 $totMaskedBP += $mlen;
67 $totMaskedBPC += $mlen;
73 ++$MaskLen{$mlen}->[1];
77 print "1-",join('|',@
$dat1),"\n";
80 print "2-",join('|',@
$dat2),"\n";
88 my $str = "Out Pairs: $CountPairs\nFQ1 over hang: $Count1\nFQ2 over hang: $Count2\n";
90 print LOG
"From [$inA]&[$inB] to [$out.stat]\n$str";
92 $totMaskedReads = -1 unless $totMaskedReads;
93 $t = int(0.5+10*$totMaskedBP/$totMaskedReads)/10;
94 $str = "\nTotal Reads, Base: $CountPairs, $totBase\tMasked Reads, Base: $totMaskedReads, $totMaskedBP\tAvg Masked Length: $t\n";
95 $totMaskedReadsC = -1 unless $totMaskedReadsC;
96 $t = int(0.5+10*$totMaskedBPC/$totMaskedReadsC)/10;
97 $str .= "After Casava-filter:\nTotal Base: $totBaseC\tMasked Reads, Base: $totMaskedReadsC, $totMaskedBPC\tAvg Masked Length: $t\n\n";
99 print LOG
$str,"@ ",join("\t",$totBaseC,$totMaskedBPC,$totBase,$totMaskedBP,$inA),"\n";
101 $str="[Compared] (0 or 8 is OK, diff for id,comment,seq,q,eamss)\n";
102 for my $k (sort {$a <=> $b} keys %Count) {
103 $str .= sprintf("%#06b\t(%#x)\t%d\n",$k,$k,$Count{$k});
108 $str="\n[Read_Type] (1:Read2, 2:Casava-filtered, 3:with-index, 4:index-have-N, 5:masked-with-more-reads)\n";
109 for my $k (sort {$a <=> $b} keys %ReadType) {
110 $str .= sprintf("%#06b\t(%#x)\t%d\n",$k,$k,$ReadType{$k});
115 $str="\n[Read_Length]\n";
116 for my $k (sort {$a <=> $b} keys %Readlen) {
117 $str .= sprintf("%d\t%d\n",$k,$Readlen{$k});
122 $str="\n[Masked_Length] (all, After_Casava-filter)\n";
123 for my $k (sort {$a <=> $b} keys %MaskLen) {
124 $MaskLen{$k}->[0] = 0 unless $MaskLen{$k}->[0];
125 $MaskLen{$k}->[1] = 0 unless $MaskLen{$k}->[1];
126 $str .= sprintf("%d\t%d\t%d\n",$k,$MaskLen{$k}->[0],$MaskLen{$k}->[1]);
136 my ($dat1,$dat2,$maskedQ) = @_;
139 $flag |= 1<<$i if $$dat1[$i] ne $$dat2[$i];
141 $flag |= 1<<4 if $maskedQ ne $$dat2[3];
142 return $flag; # id,comment,seq,q,eamss
146 $SIG{INT
} = \
&stopnow
;
147 warn "\nCtrl+C detected, stop and saving ...\n";
148 print LOG
"Ctrl+C detected. Stopped in mid-way.\n";
149 $dat1 = $dat2 = undef;