modified: makefile
[GalaxyCodeBases.git] / tools / linkage / readblast.pl
blobdad77909b4d1793fab8b202aa7878e5f188dc30e
1 #!/bin/env perl
2 use strict;
3 use warnings;
5 unless (@ARGV > 0) {
6 print "perl $0 <max trim> <outfile> <infiles>\n";
7 exit 0;
9 # infiles should be of :
10 # -evalue 3e-11 -best_hit_overhang 0.25 -best_hit_score_edge 0.05 -outfmt '6 qseqid sseqid pident length nident mismatch gapopen qstart qend sstart send evalue bitscore btop'
11 # since each marker can only in one blast result file, we can split by chr.
12 # For PA64 g=347534994(non-N), 0.01*1/g=2.877e-11. Alpha=0.96 %
14 my $maxTrim=shift;
15 my $out_file = shift;
16 my @Sinf;
18 sub getO($$$) {
19 my ($col,$m,$arr)=@_;
20 #my @A=sort {$a->[$col] <=> $b->[$col]} @$arr;
21 my %A;
22 push @{$A{ $_->[$col] }},$_ for @$arr;
23 my $id=(sort {$a <=> $b} keys %A)[$m]; # ASC
24 return $A{$id};
26 sub pasteSinf() {
27 #my ()=@_;
28 for (@Sinf) {
29 my ($Qid,$Sid,$Pidentity,$AlnLen,$identical,$mismatches,$Gap,$Qs,$Qe,$Ss,$Se,$E,$bitScore,$BTOP)=@$_;
30 my @t=split //,$BTOP;
31 for (0..$#t-1) {
32 ++$identical if $t[$_] =~ /[MRWSYK]/i and $t[$_+1] ne '-';
34 $_=[$Qid,$Sid,$Pidentity,$AlnLen,$identical,$mismatches,$Gap,$Qs,$Qe,$Ss,$Se,$E,$bitScore,$BTOP];
36 my $A=&getO(4,-1,\@Sinf); # by col.4 identical, max
37 $A=&getO(11,0,$A); # by col.11 E, min
38 return [] if $#$A > 0; # if still more than 1, is duplicated
39 return $$A[0];
42 if (-s $out_file) {
43 die "[x]$out_file exists, remove first !\n";
44 } else {
45 warn "[!]Open $out_file for write.\n";
47 open O,'>',$out_file or die $!;
48 open LOG,'>',$out_file.'.log' or die $!;
49 while (my $in_file=shift) {
50 print LOG ">$in_file\t";
51 open I,'<',$in_file or warn "\n[!]Error opening $in_file: $!\n";
52 my ($lastQid,$MarkerLen);
53 while (<I>) {
54 next if /^#/;
55 # Fields: query id, subject id, % identity, alignment length, identical, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score, BTOP
56 #-outfmt '6 qseqid sseqid pident length nident mismatch gapopen qstart qend sstart send evalue bitscore btop'
57 chomp;
58 my ($Qid,$Sid,$Pidentity,$AlnLen,$identical,$mismatches,$Gap,$Qs,$Qe,$Ss,$Se,$E,$bitScore,$BTOP)=split /\t/;
59 my @Dat=($Qid,$Sid,$Pidentity,$AlnLen,$identical,$mismatches,$Gap,$Qs,$Qe,$Ss,$Se,$E,$bitScore,$BTOP);
60 if ($lastQid and ($lastQid ne $Qid)) {
61 #print STDERR "-";
62 my @t=@{&pasteSinf()};
63 goto tNEXT if $#t == -1;
64 my ($Qs,$Qe)=@t[7,8]; # @{$t[0]}[7,8]
65 if ( ($Qs > $maxTrim) or (($MarkerLen-$Qe)> $maxTrim) ) {
66 print LOG "|$Qs,$Qe\n";
67 goto tNEXT;
68 } else {
69 print LOG "-";
71 print O join("\t",@t,scalar @Sinf),"\n";
72 tNEXT: @Sinf=(\@Dat);
73 } else {
74 push @Sinf,\@Dat;
76 $lastQid=$Qid;
77 $MarkerLen=1 + 2*(split /m/,$Qid)[-1];
79 close I;
80 print LOG ".\n";
82 close LOG;
83 close O;
85 __END__
86 ./readblast.pl 15 t.out ex32Chr01.pa64m7e
87 cat chrorder|while read a;do ./readblast.pl 15 ./out/f45$a.pa64 ./out/ex45$a.pa64m6;done # 2> ./out/f45$a.pa64.log
89 perl -lane 'BEGIN {my %d} ++$d{$F[1]}; END {print "$_\t$d{$_}" for sort {$d{$a} <=> $d{$b}} keys %d;}' ./out/f45*.pa64 | tee stat.pa64
90 grep Scaffold stat.pa64 |perl -lane 'BEGIN {my %a;my $b} ++$a{$F[1]};$b += $F[1]; END {print "$_\t$a{$_}" for sort {$a<=>$b} keys %a; print "\n$b"}'
92 #!/bin/sh
93 #$ -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH,BLASTDB
94 #$ -cwd -r y -l vf=900m
95 #$ -o /dev/null -e /dev/null
96 #$ -S /bin/bash -t 1-12
97 SEEDFILE=./doblast.cmd
98 SEED=$(sed -n -e "$SGE_TASK_ID p" $SEEDFILE)
99 export BLASTDB=/ifs1/POPULATION/Rice/gapfilling/blast
100 eval ./blastn -task megablast -db PA64scaf -outfmt \'6 qseqid sseqid pident length nident mismatch gapopen qstart qend sstart send evalue bitscore btop\' -num_threads 2 -evalue 3e-11 -best_hit_overhang 0.25 -best_hit_score_edge 0.05 $SEED
102 $ cat doblast.cmd
103 -query ./marker/ex45Chr01.marker -out ./out/ex45Chr01.pa64m6
104 -query ./marker/ex45Chr02.marker -out ./out/ex45Chr02.pa64m6