bin/bp_fastam9_to_table

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 fastm9_to_table  - turn FASTA -m 9 output into NCBI -m 9 tabular output
   6
   7 =head1 SYNOPSIS
   8
   9  fastm9_to_table [-e evaluefilter] [-b bitscorefilter] [--header] [-o outfile] inputfile1 inputfile2 ...
  10
  11 =head1 DESCRIPTION
  12
  13 Command line options:
  14   -e/--evalue   evalue    -- filter by evalue
  15   -b/--bitscore bitscore  -- filter by bitscore
  16   --header                -- boolean flag to print column header
  17   -o/--out                -- optional outputfile to write data,
  18                              otherwise will write to STDOUT
  19   -h/--help               -- show this documentation
  20
  21 Not technically a SearchIO script as this doesn't use any Bioperl
  22 components but is a useful and fast.  The output is tabular output
  23 with the standard NCBI -m9 columns.
  24
  25  queryname
  26  hit name
  27  percent identity
  28  alignment length
  29  number mismatches
  30  number gaps
  31  query start  (if on rev-strand start > end)
  32  query end
  33  hit start (if on rev-strand start > end)
  34  hit end
  35  evalue
  36  bit score
  37
  38 Additionally 3 more columns are provided
  39  fasta score
  40  sw-score
  41  percent similar
  42  query length
  43  hit length
  44  query gaps
  45  hit gaps
  46
  47 =head1 AUTHOR - Jason Stajich
  48
  49 Jason Stajich jason_at_bioperl-dot-org
  50
  51 =cut
  52
  53 use strict;
  54 use warnings;
  55 use Getopt::Long;
  56 my $hitsection = 0;
  57 my %data;
  58
  59 my ($evalue,$bitscore,$header,$outfile) = ( 10, 0);
  60 GetOptions(
  61            'b|bitscore|bits:f'   => \$bitscore,
  62            'e|evalue:f'          => \$evalue,
  63            'header'              => \$header,
  64            'o|out|outfile:s'     => \$outfile,
  65            'h|help'              => sub { exec('perldoc',$0); exit; }
  66            );
  67
  68 my $outfh;
  69 if( $outfile ) {
  70     open $outfh, '>', $outfile or die "Could not write file '$outfile': $!\n";
  71 } else {
  72     $outfh = \*STDOUT;
  73 }
  74
  75 # query start -- an0
  76 # query en    -- ax0
  77 # hit start   -- an1
  78 # hit end     -- ax1
  79
  80 my @fields = qw(qname hname percid alen mmcount gapcount
  81                 qstart qend hstart hend evalue score bits fs sw-score
  82                 percsim qlen hlen qgap hgap);
  83 print $outfh "#",uc(join("", map{ sprintf("%-10s",$_) } @fields)), "\n" if $header;
  84
  85 while(<>) {
  86     my $linestr = $_;
  87     if( /^\s*\d+>>>(\S+).+/ ) {
  88         $data{'qname'} = $1;
  89         if( /\-?\s+(\d+)\s+(aa|nt)\s+$/ ){
  90             $data{'qlen'} = $1;
  91         }
  92     } elsif( $hitsection && (/^>>>\Q$data{'qname'}/ || /^>>>/) ) {
  93         $hitsection = 0;
  94     } elsif( /^The best scores are:/ ) {
  95         $hitsection = 1;
  96     } elsif( /^\s+$/ ) {
  97     } elsif( $hitsection ) {
  98         if( s/^(\S+)\s+(.+)\(\s*(\d+)\)\s+// ) {
  99             my ($hit, $desc,$hitlen) = ($1,$2,$3);
 100             my ($dir) = ( s/^\[(r|f)\]\s+// );
 101             my @line = split(/\s+/,$_);
 102             $data{'hname'}    = $hit;
 103             $data{'hlen'}     = $hitlen;
 104             $data{'score'}    = shift @line;
 105             $data{'bits'}     = shift @line;
 106             $data{'evalue'}   = shift @line;
 107             $data{'percid'}   = shift @line;
 108
 109             $data{'percsim'}  = shift @line;
 110             $data{'sw-score'} = shift @line;
 111             $data{'alen'}     = shift @line;
 112             $data{'qstart'}   = shift @line;
 113             $data{'qend'}     = shift @line;
 114             $data{'pn0'}      = shift @line; # pn0
 115             $data{'px0'}      = shift @line; # px0
 116             $data{'hstart'}   = shift @line; # an1
 117             $data{'hend'}     = shift @line; # ax1
 118             $data{'pn1'}      = shift @line; # pn1
 119             $data{'px1'}      = shift @line; # px1
 120             # query + hit gaps
 121             $data{'qgap'}     = shift @line;
 122             $data{'hgap'}     = shift @line;
 123             $data{'gapcount'} = $data{'qgap'} + $data{'hgap'};
 124             $data{'fs'}       = shift @line;
 125
 126             $data{'mmcount'} = $data{'alen'} - ( int($data{'percid'} * $data{'alen'}) + $data{'gapcount'});
 127             #next if( $data{'evalue'} > $evalue ||
 128         #            $data{'bits'} < $bitscore );
 129
 130             for ( $data{'percid'}, $data{'percsim'} ) {
 131                 $_ = sprintf("%.2f",$_*100);
 132             }
 133             print $outfh join( "\t",map { $data{$_} } @fields),"\n";
 134         } else {
 135             # print STDERR "unrecognized line \n$linestr";
 136         }
 137     } else {
 138         # warn("skipping a line like this: $_");
 139     }
 140 }