can download plant phenotype data in the same way as plot phenotype data
[sgn.git] / lib / SGN / IntronFinder / Homology.pm
blob76a98add7c1d3bc83c551746facf7e4d0ef33b93
1 package SGN::IntronFinder::Homology;
3 use strict;
4 use warnings;
6 use Carp;
7 use FindBin;
8 use POSIX qw(ceil);
9 use File::Spec;
11 use Memoize;
13 use Getopt::Std;
14 use Pod::Usage;
16 use Bio::Range;
17 use Bio::SearchIO;
18 use Bio::SeqIO;
20 #use Data::Dumper;
22 sub find_introns_txt {
23 my ( $in_fh, $out_fh, $max_evalue, $gene_feature_file, $tempfile_dir,
24 $protein_db_base )
25 = @_;
27 # validate our input sequences and copy them to a temp file
28 my $seq_in = Bio::SeqIO->new( -fh => $in_fh, -format => 'fasta' );
29 die "Invalid input FASTA sequence " unless $seq_in;
31 my $temp_seq = File::Temp->new(
32 DIR => $tempfile_dir,
33 TEMPLATE => $FindBin::Script . '-seq_in-XXXXXX',
34 UNLINK => 0,
36 $temp_seq->close;
37 my $seq_out_temp = Bio::SeqIO->new(
38 -file => '>' . $temp_seq->filename,
39 -format => 'fasta',
41 my $seq_count = 0;
42 while ( my $seq = $seq_in->next_seq ) {
43 $seq_count++;
44 $seq_out_temp->write_seq($seq);
46 $seq_in = $seq_out_temp = undef; #< undef the seqIOs to close and
47 # flush their files
48 $seq_count or confess "must provide at least one input sequence\n";
50 # run blastall, feeding sequences on its STDIN and STDOUT
51 my $blast_out = File::Temp->new(
52 DIR => $tempfile_dir,
53 TEMPLATE => $FindBin::Script . '-blast_report-XXXXXX',
54 UNLINK => 1,
56 $blast_out->close; #< can't use the fh
58 glob $protein_db_base.'.*'
59 or confess "BLAST db $protein_db_base.* not present on disk";
61 my $cmd =
62 "blastall -d '$protein_db_base' -p 'blastx' -e $max_evalue < $temp_seq 2>&1 > $blast_out";
63 system $cmd;
64 if( $? ) {
65 die "$! (\$?: $?) running $cmd: "
66 .do {
67 local $/;
68 if( open my $f, $blast_out ) {
69 <$f>
70 } else {
76 # go through the blast report
77 my $blast_in = Bio::SearchIO->new(
78 -format => 'blast',
79 -file => $blast_out->filename,
82 # %unigenes maps the original identifier (EST) to an array (sequence
83 # of unigene, unigene id, EST_hqi start coord (on EST), EST length,
84 # EST sequence, EST start on unigene).
85 while ( my $result = $blast_in->next_result ) {
87 my $results_with_introns = 0;
89 my $query = $result->query_name;
91 $out_fh->print("Results for $query:\n\n");
93 while ( my $hit = $result->next_hit ) {
94 while ( my $hsp = $hit->next_hsp ) {
96 #check annotation of tair database for exon positions
97 #these are intron positions on protein, offset by where our hit starts.
98 #i.e. if the protein had an intron excised after the third amino acid, but our hit starts from the second amino acid, aa_intron_positions will contain 2.
99 #may contain fractions (introns that divide codons)
100 my @aa_intron_positions = checkIntrons(
101 $gene_feature_file, $hit->name,
102 $hsp->start('hit'), $hsp->end('hit')
105 # print ("aa intron positions: \n");
106 # printArray(@aa_intron_positions);
108 #index in hit_string (aa) of real positions
109 my @aa_visual_positions =
110 map { convertRealToVisual( $hsp->hit_string, $_ ); }
111 @aa_intron_positions;
113 # printArray(@aa_visual_positions);
114 #index in query_string(aa) of real positions
116 my @query_aa_real_positions =
117 map { convertVisualToReal( $hsp->query_string, $_ ); }
118 @aa_visual_positions;
120 #my @query_aa_real_positions = @aa_visual_positions;
121 # printArray(@query_aa_real_positions);
123 my @cdna_intron_relative_positions =
124 map { $_ * 3 } @query_aa_real_positions;
126 # printArray(@cdna_intron_relative_positions);
128 my $reverse_orientation = 0;
130 my $blast_frame =
131 ( $hsp->query->frame + 1 ) * $hsp->query->strand;
133 if ( $blast_frame < 0 ) {
134 $reverse_orientation = 1;
137 my @cdna_intron_absolute_positions;
138 if ($reverse_orientation) {
139 @cdna_intron_absolute_positions =
140 map { $hsp->end('query') - $_ - 1; }
141 @cdna_intron_relative_positions;
143 else {
144 @cdna_intron_absolute_positions =
145 map { $_ + $hsp->start('query') - 1; }
146 @cdna_intron_relative_positions;
149 # make sure we don't get intron positions like 45.99999999999
150 @cdna_intron_absolute_positions =
151 map { roundToInt($_) } @cdna_intron_absolute_positions;
153 if ( @aa_visual_positions
154 ) #make sure we have at least one intron in the alignment to print
156 $results_with_introns++; # results with introns count
157 if ($reverse_orientation) {
158 $out_fh->print("Frame for this query is negative.\n");
161 $out_fh->print(
162 @aa_visual_positions . " introns found in region.\n" );
164 displayAlignments( $out_fh, $query, $hsp, $hit,
165 $reverse_orientation );
167 displayIntrons( $out_fh, \@cdna_intron_absolute_positions,
168 \@aa_visual_positions, $hsp->query_string,
169 $hsp->hit_string, );
170 $out_fh->print("\n");
173 # print "\n";
176 $out_fh->print(
177 "$results_with_introns results returned for query $query.\n\n");
181 ########## SUBROUTINES ################
183 # check if there is an intron in the given gene between start and end.
184 # start and end are amino acid #s, with first amino acid in protein = 1
185 # values will be offset from start, i.e. will return the number of aa.s
186 # expected between the value of start and a given intron, rather
187 # than absolute positions on the gene.
189 sub checkIntrons {
190 my ( $gene_feature_file, $geneid, $start, $end ) = @_;
192 my $features = _coding_region_index($gene_feature_file);
193 my $frec = $features->{$geneid}
194 or return;
195 my ( $coords, $orientation ) = @$frec;
196 my @coords = @$coords;
198 @coords = sort { $a <=> $b } @coords;
200 # get rid of contiguous coding regions that don't have introns
201 # between them
202 @coords = deleteContiguousExons(@coords);
204 # find # of amino acids coded by each coding region to do this, we
205 # have to play with the coordinates if the protein is reverse
206 # oriented
208 if ( $orientation eq "reverse" ) {
209 my $start_on_chrom = $coords[$#coords];
211 @coords =
212 sort { $a <=> $b }
213 map { ( $_ - $start_on_chrom ) * -1 } @coords;
215 my @aa_per_coding_region;
217 for ( my $j = 0 ; $j < @coords ; $j += 2 ) {
219 my $numaa = ( $coords[ $j + 1 ] - $coords[$j] + 1 ) / 3;
220 push @aa_per_coding_region, $numaa;
223 my @relevant_introns;
225 for ( my $j = 0, my $curTotal = 0 ; $j < @aa_per_coding_region ; $j++ ) {
226 $curTotal += $aa_per_coding_region[$j];
227 if ( $curTotal >= $end ) {
228 last;
230 elsif ( $curTotal > $start ) {
231 push @relevant_introns, $curTotal - $start + 1;
233 # print("total is $curTotal, start is $start\n");
236 return @relevant_introns;
239 # first time checkIntrons called, cache our gene coding regions for this file in memory. could have done this as a memoized subroutine
240 memoize('_coding_region_index');
242 sub _coding_region_index {
243 my ($gene_feature_file) = @_;
245 $gene_feature_file or die "must pass a gene feature file!\n";
246 open my $datafile, '<', $gene_feature_file
247 or die "couldn't open data file $gene_feature_file";
249 my %coding_features;
250 while (<$datafile>) {
251 my ( undef, $geneid, undef, $type, $start, $stop, $length,
252 $orientation ) = split;
253 next unless $type eq 'coding_region';
254 push @{ $coding_features{$geneid}[0] }, $start, $stop;
255 $coding_features{$geneid}[1] = $orientation;
258 return \%coding_features;
261 #remove cases where we have coding region coordinates that do not actually have an intron between them, such as (0, 15, 15, 16) is reduced to (0, 16)
262 sub deleteContiguousExons(@) {
263 my (@coords) = @_;
264 my @tempcoords;
266 for ( my $i = 0 ; ( $i < @coords ) ; $i++ ) {
267 if ( not( $coords[ $i + 1 ] ) || ( $coords[$i] != $coords[ $i + 1 ] ) )
269 push @tempcoords, $coords[$i];
271 else {
272 $i++;
275 return @tempcoords;
278 #check that the introns are within the est, rather than elsewhere on the unigene.
279 sub removeOutofboundsIntrons {
280 my ( $pos, $start, $end ) = @_;
281 if ( $pos > $start && $pos < $end ) {
282 return $pos;
284 else {
285 return ();
289 sub displayAlignments {
291 my ( $out_fh, $query, $hsp, $hit, $reverse_orientation ) = @_;
292 my $eststart;
294 my $hit_name =
295 $hit->name; #$opt_w ? makeArabidopsisURL( $hit->name ) : ( $hit->name );
297 my $query_start_pos =
298 $reverse_orientation
299 ? $hsp->end('query')
300 : $hsp->start('query');
302 #we have no info in the database, just the sequence from the input file. since we blasted with the
303 #EST sequence anyway, no need for arithmetic, just print hsp and query.
305 my $wslen = 20 - length($query);
307 $out_fh->print( $query . ":" );
309 $out_fh->print( ' ' x ( $wslen - length($query_start_pos) ) );
311 $out_fh->print( $query_start_pos . "|" );
312 $out_fh->print( $hsp->query_string . "\n" );
314 #$eststart = $hsp->start('query');
316 #blast results
317 $wslen = 20 - ( length( $hit->name ) + length( $hsp->start('hit') ) );
318 $out_fh->print( $hit_name . ":" );
319 $out_fh->print( ' ' x $wslen );
320 $out_fh->print( $hsp->start('hit') . "|" );
321 $out_fh->print( $hsp->hit_string . "\n" );
324 return $reverse_orientation;
327 sub displayIntrons {
328 my ( $out_fh, $intron_absolute_positions, $intron_visual_positions,
329 $query_string, $hit_string, )
330 = @_;
332 my @intron_visual_positions = @$intron_visual_positions;
333 my @intron_absolute_positions = @$intron_absolute_positions;
335 # map {print $_, " "} @intron_visual_positions;
337 my $intronstring = "possible introns: |";
338 my $positionstring = "positions: |";
340 my $current_visual_intron = shift @intron_visual_positions;
341 my $current_absolute_intron = shift @intron_absolute_positions;
343 for ( my $i = 1 ;
344 $current_visual_intron && $current_absolute_intron ; $i++ )
346 if ( $i >= $current_visual_intron ) {
347 $intronstring .= "^";
348 $positionstring .= $current_absolute_intron;
350 # print "printing intron at $i\n";
351 # print "cvi is $current_visual_intron\n";
353 for ( my $j = 1 ; $j < length $current_absolute_intron ; $j++ ) {
354 $intronstring .= " ";
355 $i++;
357 $current_visual_intron = shift @intron_visual_positions;
358 $current_absolute_intron = shift @intron_absolute_positions;
360 else {
361 $intronstring .= " ";
362 $positionstring .= " ";
366 $out_fh->print( $intronstring . "\n" . $positionstring . "\n" );
369 sub convertRealToVisual {
370 my ( $var1, $var2 ) = @_;
371 return convertRealVisual( $var1, $var2, 1 );
374 sub convertVisualToReal {
375 my ( $var1, $var2 ) = @_;
376 return convertRealVisual( $var1, $var2, 0 );
379 # what position on the string does the dna/aa index correspond to, or vice versa, given gaps in the string.
380 # probably easier to do with regexps but i cant find the solution right now.
381 sub convertRealVisual {
382 my ( $str, $pos, $r2v ) =
383 @_; #r2v is true if realtovisual, false if visualtoreal
384 my $visualpos = 0;
385 my $elem;
387 my @strarray = split( //, $str );
389 while ( $visualpos < $pos ) {
390 $elem = $strarray[ $visualpos++ ];
391 if ( $elem eq '-' ) {
392 if ($r2v) { $pos += 1; }
393 else { $pos -= 1; }
396 return ($pos);
399 sub printArray(@) {
400 my (@array) = @_;
401 map { print $_, " "; } @array;
402 print "\n";
405 sub makeArabidopsisURL($) {
406 my ($hit) = @_;
407 my $url =
408 "http://www.arabidopsis.org/servlets/Search?type=general&name=GENEID&action=detail&method=4&sub_type=gene";
409 $url =~ s/GENEID/$hit/;
410 $url = "<a href=$url>$hit</a>";
414 # get file name from path
417 sub getFileName($) {
418 my ($path) = @_;
419 my $filename;
420 ( $_, $_, $filename ) = File::Spec->splitpath($path);
421 return $filename;
424 sub roundToInt($) {
425 my ($num) = @_;
426 my $res = int( $num + .5 * ( $num <=> 0 ) );
428 return $res;