examples/align/FastAlign.pl

   1 #! /usr/bin/perl
   2 #####################################################
   3 #                 Fasta
   4 #                     |
   5 #                     Align
   6 #
   7 #                     By
   8 #                 Antony Vincent
   9 #          (a.vincent.0312@gmail.com)
  10 #
  11 # FastAlign is a perl script which uses the heuristic method
  12 # of tfasty36 for find similarity between a query sequence
  13 # in amino acids and a sequence in nucleotides. It provides
  14 # a more intuitive output to find exon-intron junctions.
  15 # The query string is in amino acids and the hit string is
  16 # in nucleotides. There are extra nucleotides at the end of
  17 # the hit string (option -diff and by default = 10), that
  18 # allow to verify if the intron start with common rules
  19 # (5'-GTGCGA-... for group II intron and after an exonic T
  20 # for group I intron).
  21 #
  22 # The FASTA version can be changed by the user by changing
  23 # the line with tfasty36 for tfastyXX.
  24 #
  25 # If you have Emboss, you can genarate a graphic with option
  26 # -graph 1.
  27 #
  28 # For complete help: type perl fastalign.pl -help
  29 #              Last Update: 01/06/13
  30 #######################################################
  31
  32
  33 =head1
  34
  35 Copyright (C) 2013  Antony Vincent
  36
  37 Licence:
  38
  39     This program is free software: you can redistribute it and/or modify
  40     it under the terms of the GNU General Public License as published by
  41     the Free Software Foundation, either version 3 of the License, or
  42     (at your option) any later version.
  43
  44     This program is distributed in the hope that it will be useful,
  45     but WITHOUT ANY WARRANTY; without even the implied warranty of
  46     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  47     GNU General Public License for more details.
  48
  49     You should have received a copy of the GNU General Public License
  50     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  51
  52 =cut
  53
  54 use strict;
  55 use Bio::SearchIO;
  56 use Bio::SeqIO;
  57 use Getopt::Long;
  58 use Bio::SeqUtils;
  59
  60
  61 ## Set the default variables
  62         my $identity     = 75;
  63         my $length      = 50;
  64         my $diff      = 10;
  65         my $out      = 'output';
  66         my $graphic      = 10;
  67         my $query;
  68         my $library;
  69         my $help;
  70
  71 GetOptions(
  72     'seq=s'    => \$query,
  73     'db=s'     => \$library,
  74     'graph=s'     => \$graphic,
  75     'i=i'     => \$identity,
  76     'l=i'     => \$length,
  77     'diff=s'    => \$diff,
  78     'out=s'     => \$out,
  79     'help!'     => \$help,
  80 )
  81 or die "Incorrect usage! Try perl fastalign.pl -help for an exhaustif help.\n";
  82 ###
  83         if( $help )
  84 { # if start
  85     print "\n";
  86     print "Two options are required:\n";
  87     print "     -seq: Your sequence in amino acids\n";
  88     print "     -db: The sequence in nucleotides (Could be a whole genome or a partial sequence...)\n";
  89     print "\n";
  90     print "There are few miscellaneous options:\n";
  91     print "     -i: Minimum identity percentage (default = 75)\n";
  92     print "     -l: Minimum match length (default = 50)\n";
  93     print "     -diff: Difference between the hit string and the alignement (default = 10)\n";
  94     print "     -out: Name of the output file (default = output.txt)\n";
  95     print "     -graph: If this option = 1, a graph will be generated (default = no)\n";
  96 } # if help
  97
  98         else
  99 { # else start
 100 my $date = `date`;
 101 open (WRITE, ">>$out.txt"); ## Open the output file
 102 print WRITE "           Fasta\n";
 103 print WRITE "               |\n";
 104 print WRITE "               Align\n\n";
 105 print WRITE "Date:", $date, "\n";
 106 print WRITE "PARAMETERS\n";
 107 print WRITE "Minimum identity =", $identity, "\n";
 108 print WRITE "Minimum length =", $length, "\n";
 109 print WRITE "Diff =", $diff, "\n\n";
 110
 111         if ( $graphic == 1 )
 112 {
 113 open (WRITE, ">>$out.txt"); ## Open the output file
 114 open (WRITE2, ">>lindna.lnp"); ## Open the lindna config file
 115
 116 ## start the lindna header
 117 print WRITE2 "start";
 118 print WRITE2 "\t";
 119 print WRITE2 "1";
 120 print WRITE2 "\n";
 121 print WRITE2 "End";
 122 print WRITE2 "\t";
 123 my $seqio_obj = Bio::SeqIO->new(-file => "$library", -format => "fasta" );
 124 my $seq_obj = $seqio_obj->next_seq;
 125 my $count_obj = $seq_obj->length;
 126 print WRITE2 "$count_obj";
 127 print WRITE2 "\n\n";
 128 print WRITE2 "group";
 129 print WRITE2 "\n";
 130 }
 131         else
 132 {
 133         print "No graphic generated \n";
 134 }
 135 ## run tfasty36
 136 my $fh;
 137 my $fasta   = "tfasty36"; # <------ You can change this line for newest fasta algorithm
 138
 139 my $command = "$fasta $query $library";
 140
 141 open $fh,"$command |" || die("cannot run fasta cmd of $command: $!\n");
 142
 143 my $searchio  = Bio::SearchIO->new(-format => 'fasta', -fh => $fh);
 144 print WRITE "Fasta algorithm:", $fasta, "\n\n";
 145                         ## start the parsing part of the script
 146
 147 while( my $result = $searchio->next_result ) {
 148   ## $result is a Bio::Search::Result::ResultI compliant object
 149   while( my $hit = $result->next_hit ) {
 150     ## $hit is a Bio::Search::Hit::HitI compliant object
 151     while( my $hsp = $hit->next_hsp ) {
 152       ## $hsp is a Bio::Search::HSP::HSPI compliant object
 153       if( $hsp->length('total') > $length ) {
 154         if ( $hsp->percent_identity >= $identity ) {
 155
 156                         ## Generals informations
 157 print WRITE "Rank=", $hsp->rank, "\n",
 158             "Query=",   $result->query_name, "\n",
 159             "Hit=",        $hit->name, "\n" ,
 160             "Length=",     $hsp->length('total'), "\n",
 161             "Percent_id=", $hsp->percent_identity, "\n",
 162             "Strand=", $hsp->strand('hit'), "\n";
 163
 164
 165                         print WRITE "\n";
 166
 167
 168
 169
 170                         ## Search for nucleotide sequences
 171                         print WRITE "\n";
 172         my $start_hit = $hsp->start('hit'), "\n";
 173         my $end_hit = $hsp->end('hit') , "\n";
 174         my $in  = Bio::SeqIO->new(-file => "$library" , '-format' => 'fasta');
 175
 176     while ( my $seq = $in->next_seq() ) {#1
 177
 178                 ## looking for query position
 179                         my $start_query = $hsp->start('query'), "\n";
 180                         my $end_query = $hsp->end('query') , "\n";
 181                 ## aa_to_3aa
 182         my $seqobj = Bio::PrimarySeq->new ( -seq => $hsp->query_string);
 183         my $polypeptide_3char = Bio::SeqUtils->seq3($seqobj);
 184
 185                 ## modify the homology string
 186       my $homo = $hsp->homology_string;
 187          $homo =~ s/:/|||/g;
 188          $homo =~ s/\./***/g;
 189          $homo =~ s/ /XXX/g;
 190
 191
 192                         ## HSP
 193
 194         print WRITE "Query($start_query,$end_query)\n";
 195         print WRITE "Hit($start_hit,$end_hit)\n\n";
 196         print WRITE $polypeptide_3char, "\n";
 197         print WRITE $homo, "\n";
 198         print WRITE $seq->subseq($start_hit,$end_hit+$diff), "\n";
 199
 200
 201         if ( $graphic == 1)
 202 { ## if start
 203          ## write in lindna file
 204         print WRITE2 "label", "\n", "Block", "\t", "$start_hit", "\t",
 205                       "$end_hit", "\t", "3", "\t", "H", "\n";
 206         print WRITE2 "Exon", $hsp->rank, "\n";
 207         print WRITE2 "endlabel";
 208         print WRITE2 "\n\n";
 209 } ## if end
 210         else
 211 {print "No lindna file generated\n";}
 212 } #1
 213 print WRITE "\n";
 214
 215         }
 216       }
 217     }
 218   }
 219 }
 220         if ( $graphic == 1)
 221 { ## if start
 222 print WRITE2 "endgroup";
 223 system ("lindna -infile lindna.lnp -ruler y -blocktype filled -graphout svg");
 224 system ("rm *.lnp");
 225 } ## if end
 226 } # else end