3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120627
8 #use Data::Dump qw(ddx);
9 use Galaxy
::IO
::FASTAQ
qw(readfq getQvaluesFQ);
12 die "Usage: $0 <tag list> <6 num for (N,Cross,nonCross) by (bar,ec)> <fq1> <fq2> <out prefix>\n" if @ARGV<5;
14 my @maxMis=split //,shift;
20 my $EseqLen = length $EseqR;
22 my (%BarSeq2idn,@BarSeq,@BarSeqs);
23 open L
,'<',$tagf or die "Error opening $tagf:$!\n";
26 my ($seq,$id,$name)=split /\t/;
31 push @BarSeqs,[split //,$seq];
32 $BarSeq2idn{$seq} = [$id,$name,0,0,0,0]; # [id name outfh1 outfh2 PairsCnt PE_BaseCnt]
34 $BarSeq2idn{'N'} = ['NA','Unknown',0,0,0,0];
35 my $BARLEN = length $BarSeq[0];
38 die if $BARLEN != length $BarSeq[-1];
39 my $EseqAfter = $BARLEN - $EseqLen -1; # starts from 0
41 die "Max Barcode or EC length are 9 !\n" if $BARLEN > 18 or $EseqLen > 9;
43 open LOG
,'>',"${outp}.log" or die "Error opening $outp.log:$!\n";
44 print LOG
"From [$fq1f]&[$fq2f] with [$tagf][",join(',',@maxMis),"] to [$outp.*]\n";
46 for my $k (keys %BarSeq2idn) {
47 my $fname = join('.',$outp,$BarSeq2idn{$k}->[0],$BarSeq2idn{$k}->[1],'1.fq.gz');
49 open $fha,'|-',"gzip -9c >$fname" or die "Error opening output $fname:$!\n";
50 $fname = join('.',$outp,$BarSeq2idn{$k}->[0],$BarSeq2idn{$k}->[1],'2.fq.gz');
51 open $fhb,'|-',"gzip -9c >$fname" or die "Error opening output $fname:$!\n";
52 ($BarSeq2idn{$k}->[2],$BarSeq2idn{$k}->[3]) = ($fha,$fhb);
54 #open $UNKNOWN,'|-',"gzip -9c >$outp.NA.Unknown.fq.gz" or die "Error opening output file:$!\n";
57 my ($barseq,$maxmark,$barQ)=@_;
58 return [$BarSeq2idn{$barseq},'1;000000,0'] if (exists $BarSeq2idn{$barseq});
59 my $ret = [$BarSeq2idn{'N'},''];
60 my @seqss = split //,$barseq;
61 #my @seqQ = @{getQvaluesFQ($barQ)};
62 my ($minmark,$mismark,$i,%mark2i,%marks,$ratio)=(999999);
64 if ( scalar @seqss < $BARLEN ) {
65 $ret->[1] = -1 * scalar @seqss;
68 for ($i=0; $i <= $#BarSeq; ++$i) {
70 my @thisMarks=(0,0,0,0,0,0); # (N,Cross[AC,GT],nonCross) by (bar,ec)
72 for (my $j=0; $j <= $EseqAfter; ++$j) {
73 if ($seqss[$j] eq $BarSeqs[$i][$j]) {
75 } elsif ($seqss[$j] eq 'N') {
78 } elsif ( $tmp = $seqss[$j].$BarSeqs[$i][$j] and ($tmp eq 'AC' or $tmp eq 'CA' or $tmp eq 'GT' or $tmp eq 'TG') ) {
82 #my $rateT = 10^($seqQ[$i]/10); # 1/err Q=-10*lg(err), err = 10^(-Q/10) [err*1/3]
87 for (my $j=$EseqAfter+1; $j <= $#seqss; ++$j) {
88 if ($seqss[$j] eq $BarSeqs[$i][$j]) {
90 } elsif ($seqss[$j] eq 'N') {
93 } elsif ( $tmp = $seqss[$j].$BarSeqs[$i][$j] and ($tmp eq 'AC' or $tmp eq 'CA' or $tmp eq 'GT' or $tmp eq 'TG') ) {
97 #my $rateTp = 10^(($seqQ[$i]/10)-2);
102 $mark2i{$mismark} = $i;
103 $thisMarkC{$mismark} = \
@thisMarks;
105 $minmark = $mismark if $minmark > $mismark;
107 my @thisMarks = @
{$thisMarkC{$minmark}};
108 my $misStr = join('',@thisMarks);
109 $ret->[1] = "$marks{$minmark};$misStr,$minmark"; # Well, string is also scalar to print.
110 if ($marks{$minmark} == 1) {
112 for my $i (0 .. $#thisMarks) {
113 if ($thisMarks[$i] > $$maxmark[$i]) {
118 unless ($flag) { # $flag == 0
119 $ret = [$BarSeq2idn{$BarSeq[$mark2i{$minmark}]},"1;$misStr,$minmark"];
125 my $fha=openfile
($fq1f);
126 my $fhb=openfile
($fq2f);
128 my ($Count1,$Count2,$CountPairs)=(0,0,0);
129 my ($dat1,$dat2,$qbar);
132 $dat1 = readfq
($fha, \
@aux1); # [$name, $comment, $seq, $qual]
133 $dat2 = readfq
($fhb, \
@aux2);
134 if ($dat1 && $dat2) {
135 $$dat1[1] = (split / \|/,$$dat1[1])[0];
136 $$dat2[1] = (split / \|/,$$dat2[1])[0];
137 my $bar = substr $$dat1[2],0,$BARLEN;
138 #$qbar = substr $$dat1[3],0,$BARLEN;
139 my ($kret,$themark) = @
{CmpBarSeq
($bar,\
@maxMis,$qbar)};
140 ++$Ret1Stat{$themark};
141 my $fha = $kret->[2];
142 my $fhb = $kret->[3];
143 print $fha '@',join("\n",
144 join(' ',@
$dat1[0,1],"|",$kret->[0],$themark,$kret->[1]),
145 $$dat1[2],'+',$$dat1[3]),"\n";
146 print $fhb '@',join("\n",
147 join(' ',@
$dat2[0,1],"|",$kret->[0],$kret->[1]),
148 $$dat2[2],'+',$$dat2[3]),"\n";
150 $kret->[5] += length($$dat2[2]) + length($$dat2[2]);
154 print "1-",join('|',@
$dat1),"\n";
157 print "2-",join('|',@
$dat2),"\n";
165 my $str = "Out Pairs: $CountPairs\nFQ1 over hang: $Count1\nFQ2 over hang: $Count2\n
166 BarSeq\tID\tName\tRead_Pairs\tPE_Bases\tRatio\n";
171 { if ($a eq 'N') { return 1; } elsif ($b eq 'N') { return -1; } else { return $BarSeq2idn{$a}[0] cmp $BarSeq2idn{$b}[0]; }
172 } keys %BarSeq2idn) {
173 close $BarSeq2idn{$k}->[2];
174 close $BarSeq2idn{$k}->[3];
175 $str = join("\t",$k,@
{$BarSeq2idn{$k}}[0,1,4,5],$BarSeq2idn{$k}->[4]/$CountPairs)."\n";
180 print LOG
"\nTypes\n";
182 (split /;/,$a)[0] <=> (split /;/,$b)[0]
184 $Ret1Stat{$b} <=> $Ret1Stat{$a}
188 print LOG
join("\t",$k,$Ret1Stat{$k},$Ret1Stat{$k}/$CountPairs),"\n";
189 $Sum += $Ret1Stat{$k} if (split /;/,$k)[0] eq '1';
191 my $t = join('',"\nUnique_Ratio: $Sum / $CountPairs = ",$Sum/$CountPairs,"\n");