bin/bp_biofetch_genbank_proxy: move to Bio-DB-NCBIHelper
[bioperl-live.git] / bin / bp_fastam9_to_table
blob87f0162f433dfba5af7a2d9768c6b016be77ae9a
1 #!/usr/bin/perl
3 =head1 NAME
5 fastm9_to_table - turn FASTA -m 9 output into NCBI -m 9 tabular output
7 =head1 SYNOPSIS
9 fastm9_to_table [-e evaluefilter] [-b bitscorefilter] [--header] [-o outfile] inputfile1 inputfile2 ...
11 =head1 DESCRIPTION
13 Command line options:
14 -e/--evalue evalue -- filter by evalue
15 -b/--bitscore bitscore -- filter by bitscore
16 --header -- boolean flag to print column header
17 -o/--out -- optional outputfile to write data,
18 otherwise will write to STDOUT
19 -h/--help -- show this documentation
21 Not technically a SearchIO script as this doesn't use any Bioperl
22 components but is a useful and fast. The output is tabular output
23 with the standard NCBI -m9 columns.
25 queryname
26 hit name
27 percent identity
28 alignment length
29 number mismatches
30 number gaps
31 query start (if on rev-strand start > end)
32 query end
33 hit start (if on rev-strand start > end)
34 hit end
35 evalue
36 bit score
38 Additionally 3 more columns are provided
39 fasta score
40 sw-score
41 percent similar
42 query length
43 hit length
44 query gaps
45 hit gaps
47 =head1 AUTHOR - Jason Stajich
49 Jason Stajich jason_at_bioperl-dot-org
51 =cut
53 use strict;
54 use warnings;
55 use Getopt::Long;
56 my $hitsection = 0;
57 my %data;
59 my ($evalue,$bitscore,$header,$outfile) = ( 10, 0);
60 GetOptions(
61 'b|bitscore|bits:f' => \$bitscore,
62 'e|evalue:f' => \$evalue,
63 'header' => \$header,
64 'o|out|outfile:s' => \$outfile,
65 'h|help' => sub { exec('perldoc',$0); exit; }
68 my $outfh;
69 if( $outfile ) {
70 open $outfh, '>', $outfile or die "Could not write file '$outfile': $!\n";
71 } else {
72 $outfh = \*STDOUT;
75 # query start -- an0
76 # query en -- ax0
77 # hit start -- an1
78 # hit end -- ax1
80 my @fields = qw(qname hname percid alen mmcount gapcount
81 qstart qend hstart hend evalue score bits fs sw-score
82 percsim qlen hlen qgap hgap);
83 print $outfh "#",uc(join("", map{ sprintf("%-10s",$_) } @fields)), "\n" if $header;
85 while(<>) {
86 my $linestr = $_;
87 if( /^\s*\d+>>>(\S+).+/ ) {
88 $data{'qname'} = $1;
89 if( /\-?\s+(\d+)\s+(aa|nt)\s+$/ ){
90 $data{'qlen'} = $1;
92 } elsif( $hitsection && (/^>>>\Q$data{'qname'}/ || /^>>>/) ) {
93 $hitsection = 0;
94 } elsif( /^The best scores are:/ ) {
95 $hitsection = 1;
96 } elsif( /^\s+$/ ) {
97 } elsif( $hitsection ) {
98 if( s/^(\S+)\s+(.+)\(\s*(\d+)\)\s+// ) {
99 my ($hit, $desc,$hitlen) = ($1,$2,$3);
100 my ($dir) = ( s/^\[(r|f)\]\s+// );
101 my @line = split(/\s+/,$_);
102 $data{'hname'} = $hit;
103 $data{'hlen'} = $hitlen;
104 $data{'score'} = shift @line;
105 $data{'bits'} = shift @line;
106 $data{'evalue'} = shift @line;
107 $data{'percid'} = shift @line;
109 $data{'percsim'} = shift @line;
110 $data{'sw-score'} = shift @line;
111 $data{'alen'} = shift @line;
112 $data{'qstart'} = shift @line;
113 $data{'qend'} = shift @line;
114 $data{'pn0'} = shift @line; # pn0
115 $data{'px0'} = shift @line; # px0
116 $data{'hstart'} = shift @line; # an1
117 $data{'hend'} = shift @line; # ax1
118 $data{'pn1'} = shift @line; # pn1
119 $data{'px1'} = shift @line; # px1
120 # query + hit gaps
121 $data{'qgap'} = shift @line;
122 $data{'hgap'} = shift @line;
123 $data{'gapcount'} = $data{'qgap'} + $data{'hgap'};
124 $data{'fs'} = shift @line;
126 $data{'mmcount'} = $data{'alen'} - ( int($data{'percid'} * $data{'alen'}) + $data{'gapcount'});
127 #next if( $data{'evalue'} > $evalue ||
128 # $data{'bits'} < $bitscore );
130 for ( $data{'percid'}, $data{'percsim'} ) {
131 $_ = sprintf("%.2f",$_*100);
133 print $outfh join( "\t",map { $data{$_} } @fields),"\n";
134 } else {
135 # print STDERR "unrecognized line \n$linestr";
137 } else {
138 # warn("skipping a line like this: $_");