Changes: more detailed complete list of modules/dependencies added/removed
[bioperl-live.git] / examples / align / FastAlign.pl
blob2ab98e340b9b333a43d72f7d1b9ed807bd732d84
1 #! /usr/bin/perl
2 #####################################################
3 # Fasta
4 # |
5 # Align
7 # By
8 # Antony Vincent
9 # (a.vincent.0312@gmail.com)
11 # FastAlign is a perl script which uses the heuristic method
12 # of tfasty36 for find similarity between a query sequence
13 # in amino acids and a sequence in nucleotides. It provides
14 # a more intuitive output to find exon-intron junctions.
15 # The query string is in amino acids and the hit string is
16 # in nucleotides. There are extra nucleotides at the end of
17 # the hit string (option -diff and by default = 10), that
18 # allow to verify if the intron start with common rules
19 # (5'-GTGCGA-... for group II intron and after an exonic T
20 # for group I intron).
22 # The FASTA version can be changed by the user by changing
23 # the line with tfasty36 for tfastyXX.
25 # If you have Emboss, you can genarate a graphic with option
26 # -graph 1.
28 # For complete help: type perl fastalign.pl -help
29 # Last Update: 01/06/13
30 #######################################################
33 =head1
35 Copyright (C) 2013 Antony Vincent
37 Licence:
39 This program is free software: you can redistribute it and/or modify
40 it under the terms of the GNU General Public License as published by
41 the Free Software Foundation, either version 3 of the License, or
42 (at your option) any later version.
44 This program is distributed in the hope that it will be useful,
45 but WITHOUT ANY WARRANTY; without even the implied warranty of
46 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
47 GNU General Public License for more details.
49 You should have received a copy of the GNU General Public License
50 along with this program. If not, see <http://www.gnu.org/licenses/>.
52 =cut
54 use strict;
55 use Bio::SearchIO;
56 use Bio::SeqIO;
57 use Getopt::Long;
58 use Bio::SeqUtils;
61 ## Set the default variables
62 my $identity = 75;
63 my $length = 50;
64 my $diff = 10;
65 my $out = 'output';
66 my $graphic = 10;
67 my $query;
68 my $library;
69 my $help;
71 GetOptions(
72 'seq=s' => \$query,
73 'db=s' => \$library,
74 'graph=s' => \$graphic,
75 'i=i' => \$identity,
76 'l=i' => \$length,
77 'diff=s' => \$diff,
78 'out=s' => \$out,
79 'help!' => \$help,
81 or die "Incorrect usage! Try perl fastalign.pl -help for an exhaustif help.\n";
82 ###
83 if( $help )
84 { # if start
85 print "\n";
86 print "Two options are required:\n";
87 print " -seq: Your sequence in amino acids\n";
88 print " -db: The sequence in nucleotides (Could be a whole genome or a partial sequence...)\n";
89 print "\n";
90 print "There are few miscellaneous options:\n";
91 print " -i: Minimum identity percentage (default = 75)\n";
92 print " -l: Minimum match length (default = 50)\n";
93 print " -diff: Difference between the hit string and the alignement (default = 10)\n";
94 print " -out: Name of the output file (default = output.txt)\n";
95 print " -graph: If this option = 1, a graph will be generated (default = no)\n";
96 } # if help
98 else
99 { # else start
100 my $date = `date`;
101 open (WRITE, ">>$out.txt"); ## Open the output file
102 print WRITE " Fasta\n";
103 print WRITE " |\n";
104 print WRITE " Align\n\n";
105 print WRITE "Date:", $date, "\n";
106 print WRITE "PARAMETERS\n";
107 print WRITE "Minimum identity =", $identity, "\n";
108 print WRITE "Minimum length =", $length, "\n";
109 print WRITE "Diff =", $diff, "\n\n";
111 if ( $graphic == 1 )
113 open (WRITE, ">>$out.txt"); ## Open the output file
114 open (WRITE2, ">>lindna.lnp"); ## Open the lindna config file
116 ## start the lindna header
117 print WRITE2 "start";
118 print WRITE2 "\t";
119 print WRITE2 "1";
120 print WRITE2 "\n";
121 print WRITE2 "End";
122 print WRITE2 "\t";
123 my $seqio_obj = Bio::SeqIO->new(-file => "$library", -format => "fasta" );
124 my $seq_obj = $seqio_obj->next_seq;
125 my $count_obj = $seq_obj->length;
126 print WRITE2 "$count_obj";
127 print WRITE2 "\n\n";
128 print WRITE2 "group";
129 print WRITE2 "\n";
131 else
133 print "No graphic generated \n";
135 ## run tfasty36
136 my $fh;
137 my $fasta = "tfasty36"; # <------ You can change this line for newest fasta algorithm
139 my $command = "$fasta $query $library";
141 open $fh,"$command |" || die("cannot run fasta cmd of $command: $!\n");
143 my $searchio = Bio::SearchIO->new(-format => 'fasta', -fh => $fh);
144 print WRITE "Fasta algorithm:", $fasta, "\n\n";
145 ## start the parsing part of the script
147 while( my $result = $searchio->next_result ) {
148 ## $result is a Bio::Search::Result::ResultI compliant object
149 while( my $hit = $result->next_hit ) {
150 ## $hit is a Bio::Search::Hit::HitI compliant object
151 while( my $hsp = $hit->next_hsp ) {
152 ## $hsp is a Bio::Search::HSP::HSPI compliant object
153 if( $hsp->length('total') > $length ) {
154 if ( $hsp->percent_identity >= $identity ) {
156 ## Generals informations
157 print WRITE "Rank=", $hsp->rank, "\n",
158 "Query=", $result->query_name, "\n",
159 "Hit=", $hit->name, "\n" ,
160 "Length=", $hsp->length('total'), "\n",
161 "Percent_id=", $hsp->percent_identity, "\n",
162 "Strand=", $hsp->strand('hit'), "\n";
165 print WRITE "\n";
170 ## Search for nucleotide sequences
171 print WRITE "\n";
172 my $start_hit = $hsp->start('hit'), "\n";
173 my $end_hit = $hsp->end('hit') , "\n";
174 my $in = Bio::SeqIO->new(-file => "$library" , '-format' => 'fasta');
176 while ( my $seq = $in->next_seq() ) {#1
178 ## looking for query position
179 my $start_query = $hsp->start('query'), "\n";
180 my $end_query = $hsp->end('query') , "\n";
181 ## aa_to_3aa
182 my $seqobj = Bio::PrimarySeq->new ( -seq => $hsp->query_string);
183 my $polypeptide_3char = Bio::SeqUtils->seq3($seqobj);
185 ## modify the homology string
186 my $homo = $hsp->homology_string;
187 $homo =~ s/:/|||/g;
188 $homo =~ s/\./***/g;
189 $homo =~ s/ /XXX/g;
192 ## HSP
194 print WRITE "Query($start_query,$end_query)\n";
195 print WRITE "Hit($start_hit,$end_hit)\n\n";
196 print WRITE $polypeptide_3char, "\n";
197 print WRITE $homo, "\n";
198 print WRITE $seq->subseq($start_hit,$end_hit+$diff), "\n";
201 if ( $graphic == 1)
202 { ## if start
203 ## write in lindna file
204 print WRITE2 "label", "\n", "Block", "\t", "$start_hit", "\t",
205 "$end_hit", "\t", "3", "\t", "H", "\n";
206 print WRITE2 "Exon", $hsp->rank, "\n";
207 print WRITE2 "endlabel";
208 print WRITE2 "\n\n";
209 } ## if end
210 else
211 {print "No lindna file generated\n";}
212 } #1
213 print WRITE "\n";
220 if ( $graphic == 1)
221 { ## if start
222 print WRITE2 "endgroup";
223 system ("lindna -infile lindna.lnp -ruler y -blocktype filled -graphout svg");
224 system ("rm *.lnp");
225 } ## if end
226 } # else end