2 #####################################################
9 # (a.vincent.0312@gmail.com)
11 # FastAlign is a perl script which uses the heuristic method
12 # of tfasty36 for find similarity between a query sequence
13 # in amino acids and a sequence in nucleotides. It provides
14 # a more intuitive output to find exon-intron junctions.
15 # The query string is in amino acids and the hit string is
16 # in nucleotides. There are extra nucleotides at the end of
17 # the hit string (option -diff and by default = 10), that
18 # allow to verify if the intron start with common rules
19 # (5'-GTGCGA-... for group II intron and after an exonic T
20 # for group I intron).
22 # The FASTA version can be changed by the user by changing
23 # the line with tfasty36 for tfastyXX.
25 # If you have Emboss, you can genarate a graphic with option
28 # For complete help: type perl fastalign.pl -help
29 # Last Update: 01/06/13
30 #######################################################
35 Copyright (C) 2013 Antony Vincent
39 This program is free software: you can redistribute it and/or modify
40 it under the terms of the GNU General Public License as published by
41 the Free Software Foundation, either version 3 of the License, or
42 (at your option) any later version.
44 This program is distributed in the hope that it will be useful,
45 but WITHOUT ANY WARRANTY; without even the implied warranty of
46 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
47 GNU General Public License for more details.
49 You should have received a copy of the GNU General Public License
50 along with this program. If not, see <http://www.gnu.org/licenses/>.
61 ## Set the default variables
74 'graph=s' => \
$graphic,
81 or die "Incorrect usage! Try perl fastalign.pl -help for an exhaustif help.\n";
86 print "Two options are required:\n";
87 print " -seq: Your sequence in amino acids\n";
88 print " -db: The sequence in nucleotides (Could be a whole genome or a partial sequence...)\n";
90 print "There are few miscellaneous options:\n";
91 print " -i: Minimum identity percentage (default = 75)\n";
92 print " -l: Minimum match length (default = 50)\n";
93 print " -diff: Difference between the hit string and the alignement (default = 10)\n";
94 print " -out: Name of the output file (default = output.txt)\n";
95 print " -graph: If this option = 1, a graph will be generated (default = no)\n";
101 open (WRITE
, ">>$out.txt"); ## Open the output file
102 print WRITE
" Fasta\n";
104 print WRITE
" Align\n\n";
105 print WRITE
"Date:", $date, "\n";
106 print WRITE
"PARAMETERS\n";
107 print WRITE
"Minimum identity =", $identity, "\n";
108 print WRITE
"Minimum length =", $length, "\n";
109 print WRITE
"Diff =", $diff, "\n\n";
113 open (WRITE
, ">>$out.txt"); ## Open the output file
114 open (WRITE2
, ">>lindna.lnp"); ## Open the lindna config file
116 ## start the lindna header
117 print WRITE2
"start";
123 my $seqio_obj = Bio
::SeqIO
->new(-file
=> "$library", -format
=> "fasta" );
124 my $seq_obj = $seqio_obj->next_seq;
125 my $count_obj = $seq_obj->length;
126 print WRITE2
"$count_obj";
128 print WRITE2
"group";
133 print "No graphic generated \n";
137 my $fasta = "tfasty36"; # <------ You can change this line for newest fasta algorithm
139 my $command = "$fasta $query $library";
141 open $fh,"$command |" || die("cannot run fasta cmd of $command: $!\n");
143 my $searchio = Bio
::SearchIO
->new(-format
=> 'fasta', -fh
=> $fh);
144 print WRITE
"Fasta algorithm:", $fasta, "\n\n";
145 ## start the parsing part of the script
147 while( my $result = $searchio->next_result ) {
148 ## $result is a Bio::Search::Result::ResultI compliant object
149 while( my $hit = $result->next_hit ) {
150 ## $hit is a Bio::Search::Hit::HitI compliant object
151 while( my $hsp = $hit->next_hsp ) {
152 ## $hsp is a Bio::Search::HSP::HSPI compliant object
153 if( $hsp->length('total') > $length ) {
154 if ( $hsp->percent_identity >= $identity ) {
156 ## Generals informations
157 print WRITE
"Rank=", $hsp->rank, "\n",
158 "Query=", $result->query_name, "\n",
159 "Hit=", $hit->name, "\n" ,
160 "Length=", $hsp->length('total'), "\n",
161 "Percent_id=", $hsp->percent_identity, "\n",
162 "Strand=", $hsp->strand('hit'), "\n";
170 ## Search for nucleotide sequences
172 my $start_hit = $hsp->start('hit'), "\n";
173 my $end_hit = $hsp->end('hit') , "\n";
174 my $in = Bio
::SeqIO
->new(-file
=> "$library" , '-format' => 'fasta');
176 while ( my $seq = $in->next_seq() ) {#1
178 ## looking for query position
179 my $start_query = $hsp->start('query'), "\n";
180 my $end_query = $hsp->end('query') , "\n";
182 my $seqobj = Bio
::PrimarySeq
->new ( -seq
=> $hsp->query_string);
183 my $polypeptide_3char = Bio
::SeqUtils
->seq3($seqobj);
185 ## modify the homology string
186 my $homo = $hsp->homology_string;
194 print WRITE
"Query($start_query,$end_query)\n";
195 print WRITE
"Hit($start_hit,$end_hit)\n\n";
196 print WRITE
$polypeptide_3char, "\n";
197 print WRITE
$homo, "\n";
198 print WRITE
$seq->subseq($start_hit,$end_hit+$diff), "\n";
203 ## write in lindna file
204 print WRITE2
"label", "\n", "Block", "\t", "$start_hit", "\t",
205 "$end_hit", "\t", "3", "\t", "H", "\n";
206 print WRITE2
"Exon", $hsp->rank, "\n";
207 print WRITE2
"endlabel";
211 {print "No lindna file generated\n";}
222 print WRITE2
"endgroup";
223 system ("lindna -infile lindna.lnp -ruler y -blocktype filled -graphout svg");