lib/SGN/IntronFinder/Homology.pm

   1 package SGN::IntronFinder::Homology;
   2
   3 use strict;
   4 use warnings;
   5
   6 use Carp;
   7 use FindBin;
   8 use POSIX qw(ceil);
   9 use File::Spec;
  10
  11 use Memoize;
  12
  13 use Getopt::Std;
  14 use Pod::Usage;
  15
  16 use Bio::Range;
  17 use Bio::SearchIO;
  18 use Bio::SeqIO;
  19
  20 #use Data::Dumper;
  21
  22 sub find_introns_txt {
  23     my ( $in_fh, $out_fh, $max_evalue, $gene_feature_file, $tempfile_dir,
  24         $protein_db_base )
  25       = @_;
  26
  27     # validate our input sequences and copy them to a temp file
  28     my $seq_in = Bio::SeqIO->new( -fh => $in_fh, -format => 'fasta' );
  29     die "Invalid input FASTA sequence " unless $seq_in;
  30
  31     my $temp_seq = File::Temp->new(
  32         DIR      => $tempfile_dir,
  33         TEMPLATE => $FindBin::Script . '-seq_in-XXXXXX',
  34         UNLINK   => 0,
  35     );
  36     $temp_seq->close;
  37     my $seq_out_temp = Bio::SeqIO->new(
  38         -file   => '>' . $temp_seq->filename,
  39         -format => 'fasta',
  40     );
  41     my $seq_count = 0;
  42     while ( my $seq = $seq_in->next_seq ) {
  43         $seq_count++;
  44         $seq_out_temp->write_seq($seq);
  45     }
  46     $seq_in = $seq_out_temp = undef;    #< undef the seqIOs to close and
  47                                         #  flush their files
  48     $seq_count or confess "must provide at least one input sequence\n";
  49
  50     # run blastall, feeding sequences on its STDIN and STDOUT
  51     my $blast_out = File::Temp->new(
  52         DIR      => $tempfile_dir,
  53         TEMPLATE => $FindBin::Script . '-blast_report-XXXXXX',
  54         UNLINK   => 1,
  55     );
  56     $blast_out->close;                  #< can't use the fh
  57
  58     glob $protein_db_base.'.*'
  59        or confess "BLAST db $protein_db_base.* not present on disk";
  60
  61     my $cmd =
  62 "blastall -d '$protein_db_base' -p 'blastx' -e $max_evalue < $temp_seq 2>&1 > $blast_out";
  63     system $cmd;
  64     if( $? ) {
  65         die "$! (\$?: $?) running $cmd: "
  66           .do {
  67               local $/;
  68               if( open my $f, $blast_out ) {
  69                   <$f>
  70               } else {
  71                   ''
  72               }
  73           };
  74     }
  75
  76     # go through the blast report
  77     my $blast_in = Bio::SearchIO->new(
  78         -format => 'blast',
  79         -file   => $blast_out->filename,
  80     );
  81
  82     # %unigenes maps the original identifier (EST) to an array (sequence
  83     # of unigene, unigene id, EST_hqi start coord (on EST), EST length,
  84     # EST sequence, EST start on unigene).
  85     while ( my $result = $blast_in->next_result ) {
  86
  87         my $results_with_introns = 0;
  88
  89         my $query = $result->query_name;
  90
  91         $out_fh->print("Results for $query:\n\n");
  92
  93         while ( my $hit = $result->next_hit ) {
  94             while ( my $hsp = $hit->next_hsp ) {
  95
  96 #check annotation of tair database for exon positions
  97 #these are intron positions on protein, offset by where our hit starts.
  98 #i.e. if the protein had an intron excised after the third amino acid, but our hit starts from the second amino acid, aa_intron_positions will contain 2.
  99 #may contain fractions (introns that divide codons)
 100                 my @aa_intron_positions = checkIntrons(
 101                     $gene_feature_file, $hit->name,
 102                     $hsp->start('hit'), $hsp->end('hit')
 103                 );
 104
 105                 #           print ("aa intron positions: \n");
 106                 #           printArray(@aa_intron_positions);
 107
 108                 #index in hit_string (aa) of real positions
 109                 my @aa_visual_positions =
 110                   map { convertRealToVisual( $hsp->hit_string, $_ ); }
 111                   @aa_intron_positions;
 112
 113                 #           printArray(@aa_visual_positions);
 114                 #index in query_string(aa) of real positions
 115
 116                 my @query_aa_real_positions =
 117                   map { convertVisualToReal( $hsp->query_string, $_ ); }
 118                   @aa_visual_positions;
 119
 120                 #my  @query_aa_real_positions = @aa_visual_positions;
 121                 #           printArray(@query_aa_real_positions);
 122
 123                 my @cdna_intron_relative_positions =
 124                   map { $_ * 3 } @query_aa_real_positions;
 125
 126                 #           printArray(@cdna_intron_relative_positions);
 127
 128                 my $reverse_orientation = 0;
 129
 130                 my $blast_frame =
 131                   ( $hsp->query->frame + 1 ) * $hsp->query->strand;
 132
 133                 if ( $blast_frame < 0 ) {
 134                     $reverse_orientation = 1;
 135                 }
 136
 137                 my @cdna_intron_absolute_positions;
 138                 if ($reverse_orientation) {
 139                     @cdna_intron_absolute_positions =
 140                       map { $hsp->end('query') - $_ - 1; }
 141                       @cdna_intron_relative_positions;
 142                 }
 143                 else {
 144                     @cdna_intron_absolute_positions =
 145                       map { $_ + $hsp->start('query') - 1; }
 146                       @cdna_intron_relative_positions;
 147                 }
 148
 149                 # make sure we don't get intron positions like 45.99999999999
 150                 @cdna_intron_absolute_positions =
 151                   map { roundToInt($_) } @cdna_intron_absolute_positions;
 152
 153                 if ( @aa_visual_positions
 154                   ) #make sure we have at least one intron in the alignment to print
 155                 {
 156                     $results_with_introns++;    # results with introns count
 157                     if ($reverse_orientation) {
 158                         $out_fh->print("Frame for this query is negative.\n");
 159                     }
 160
 161                     $out_fh->print(
 162                         @aa_visual_positions . " introns found in region.\n" );
 163
 164                     displayAlignments( $out_fh, $query, $hsp, $hit,
 165                         $reverse_orientation );
 166
 167                     displayIntrons( $out_fh, \@cdna_intron_absolute_positions,
 168                         \@aa_visual_positions, $hsp->query_string,
 169                         $hsp->hit_string, );
 170                     $out_fh->print("\n");
 171                 }
 172
 173                 #       print "\n";
 174             }
 175         }
 176         $out_fh->print(
 177             "$results_with_introns results returned for query $query.\n\n");
 178     }
 179 }
 180
 181 ########## SUBROUTINES ################
 182
 183 # check if there is an intron in the given gene between start and end.
 184 # start and end are amino acid #s, with first amino acid in protein = 1
 185 # values will be offset from start, i.e. will return the number of aa.s
 186 # expected between the value of start and a given intron, rather
 187 # than absolute positions on the gene.
 188 #
 189 sub checkIntrons {
 190     my ( $gene_feature_file, $geneid, $start, $end ) = @_;
 191
 192     my $features = _coding_region_index($gene_feature_file);
 193     my $frec     = $features->{$geneid}
 194       or return;
 195     my ( $coords, $orientation ) = @$frec;
 196     my @coords = @$coords;
 197
 198     @coords = sort { $a <=> $b } @coords;
 199
 200     # get rid of contiguous coding regions that don't have introns
 201     # between them
 202     @coords = deleteContiguousExons(@coords);
 203
 204     # find # of amino acids coded by each coding region to do this, we
 205     # have to play with the coordinates if the protein is reverse
 206     # oriented
 207
 208     if ( $orientation eq "reverse" ) {
 209         my $start_on_chrom = $coords[$#coords];
 210
 211         @coords =
 212           sort { $a <=> $b }
 213           map  { ( $_ - $start_on_chrom ) * -1 } @coords;
 214     }
 215     my @aa_per_coding_region;
 216
 217     for ( my $j = 0 ; $j < @coords ; $j += 2 ) {
 218
 219         my $numaa = ( $coords[ $j + 1 ] - $coords[$j] + 1 ) / 3;
 220         push @aa_per_coding_region, $numaa;
 221     }
 222
 223     my @relevant_introns;
 224
 225     for ( my $j = 0, my $curTotal = 0 ; $j < @aa_per_coding_region ; $j++ ) {
 226         $curTotal += $aa_per_coding_region[$j];
 227         if ( $curTotal >= $end ) {
 228             last;
 229         }
 230         elsif ( $curTotal > $start ) {
 231             push @relevant_introns, $curTotal - $start + 1;
 232
 233             #           print("total is $curTotal, start is $start\n");
 234         }
 235     }
 236     return @relevant_introns;
 237 }
 238
 239 # first time checkIntrons called, cache our gene coding regions for this file in memory.  could have done this as a memoized subroutine
 240 memoize('_coding_region_index');
 241
 242 sub _coding_region_index {
 243     my ($gene_feature_file) = @_;
 244
 245     $gene_feature_file or die "must pass a gene feature file!\n";
 246     open my $datafile, '<', $gene_feature_file
 247       or die "couldn't open data file $gene_feature_file";
 248
 249     my %coding_features;
 250     while (<$datafile>) {
 251         my ( undef, $geneid, undef, $type, $start, $stop, $length,
 252             $orientation ) = split;
 253         next unless $type eq 'coding_region';
 254         push @{ $coding_features{$geneid}[0] }, $start, $stop;
 255         $coding_features{$geneid}[1] = $orientation;
 256     }
 257
 258     return \%coding_features;
 259 }
 260
 261 #remove cases where we have coding region coordinates that do not actually have an intron between them, such as (0, 15, 15, 16) is reduced to (0, 16)
 262 sub deleteContiguousExons(@) {
 263     my (@coords) = @_;
 264     my @tempcoords;
 265
 266     for ( my $i = 0 ; ( $i < @coords ) ; $i++ ) {
 267         if ( not( $coords[ $i + 1 ] ) || ( $coords[$i] != $coords[ $i + 1 ] ) )
 268         {
 269             push @tempcoords, $coords[$i];
 270         }
 271         else {
 272             $i++;
 273         }
 274     }
 275     return @tempcoords;
 276 }
 277
 278 #check that the introns are within the est, rather than elsewhere on the unigene.
 279 sub removeOutofboundsIntrons {
 280     my ( $pos, $start, $end ) = @_;
 281     if ( $pos > $start && $pos < $end ) {
 282         return $pos;
 283     }
 284     else {
 285         return ();
 286     }
 287 }
 288
 289 sub displayAlignments {
 290
 291     my ( $out_fh, $query, $hsp, $hit, $reverse_orientation ) = @_;
 292     my $eststart;
 293
 294     my $hit_name =
 295       $hit->name;   #$opt_w ? makeArabidopsisURL( $hit->name ) : ( $hit->name );
 296
 297     my $query_start_pos =
 298         $reverse_orientation
 299       ? $hsp->end('query')
 300       : $hsp->start('query');
 301
 302 #we have no info in the database, just the sequence from the input file. since we blasted with the
 303 #EST sequence anyway, no need for arithmetic, just print hsp and query.
 304
 305     my $wslen = 20 - length($query);
 306
 307     $out_fh->print( $query . ":" );
 308
 309     $out_fh->print( ' ' x ( $wslen - length($query_start_pos) ) );
 310
 311     $out_fh->print( $query_start_pos . "|" );
 312     $out_fh->print( $hsp->query_string . "\n" );
 313
 314     #$eststart = $hsp->start('query');
 315
 316     #blast results
 317     $wslen = 20 - ( length( $hit->name ) + length( $hsp->start('hit') ) );
 318     $out_fh->print( $hit_name . ":" );
 319     $out_fh->print( ' ' x $wslen );
 320     $out_fh->print( $hsp->start('hit') . "|" );
 321     $out_fh->print( $hsp->hit_string . "\n" );
 322
 323     #    }
 324     return $reverse_orientation;
 325 }
 326
 327 sub displayIntrons {
 328     my ( $out_fh, $intron_absolute_positions, $intron_visual_positions,
 329         $query_string, $hit_string, )
 330       = @_;
 331
 332     my @intron_visual_positions   = @$intron_visual_positions;
 333     my @intron_absolute_positions = @$intron_absolute_positions;
 334
 335     #    map {print $_, " "} @intron_visual_positions;
 336
 337     my $intronstring   = "possible introns:    |";
 338     my $positionstring = "positions:           |";
 339
 340     my $current_visual_intron   = shift @intron_visual_positions;
 341     my $current_absolute_intron = shift @intron_absolute_positions;
 342
 343     for ( my $i = 1 ;
 344         $current_visual_intron && $current_absolute_intron ; $i++ )
 345     {
 346         if ( $i >= $current_visual_intron ) {
 347             $intronstring   .= "^";
 348             $positionstring .= $current_absolute_intron;
 349
 350             #       print "printing intron at $i\n";
 351             #       print "cvi is $current_visual_intron\n";
 352
 353             for ( my $j = 1 ; $j < length $current_absolute_intron ; $j++ ) {
 354                 $intronstring .= " ";
 355                 $i++;
 356             }
 357             $current_visual_intron   = shift @intron_visual_positions;
 358             $current_absolute_intron = shift @intron_absolute_positions;
 359         }
 360         else {
 361             $intronstring   .= " ";
 362             $positionstring .= " ";
 363         }
 364     }
 365
 366     $out_fh->print( $intronstring . "\n" . $positionstring . "\n" );
 367 }
 368
 369 sub convertRealToVisual {
 370     my ( $var1, $var2 ) = @_;
 371     return convertRealVisual( $var1, $var2, 1 );
 372 }
 373
 374 sub convertVisualToReal {
 375     my ( $var1, $var2 ) = @_;
 376     return convertRealVisual( $var1, $var2, 0 );
 377 }
 378
 379 # what position on the string does the dna/aa index correspond to, or vice versa, given gaps in the string.
 380 # probably easier to do with regexps but i cant find the solution right now.
 381 sub convertRealVisual {
 382     my ( $str, $pos, $r2v ) =
 383       @_;    #r2v is true if realtovisual, false if visualtoreal
 384     my $visualpos = 0;
 385     my $elem;
 386
 387     my @strarray = split( //, $str );
 388
 389     while ( $visualpos < $pos ) {
 390         $elem = $strarray[ $visualpos++ ];
 391         if ( $elem eq '-' ) {
 392             if ($r2v) { $pos += 1; }
 393             else      { $pos -= 1; }
 394         }
 395     }
 396     return ($pos);
 397 }
 398
 399 sub printArray(@) {
 400     my (@array) = @_;
 401     map { print $_, " "; } @array;
 402     print "\n";
 403 }
 404
 405 sub makeArabidopsisURL($) {
 406     my ($hit) = @_;
 407     my $url =
 408 "http://www.arabidopsis.org/servlets/Search?type=general&name=GENEID&action=detail&method=4&sub_type=gene";
 409     $url =~ s/GENEID/$hit/;
 410     $url = "<a href=$url>$hit</a>";
 411 }
 412
 413 #
 414 # get file name from path
 415 #
 416
 417 sub getFileName($) {
 418     my ($path) = @_;
 419     my $filename;
 420     ( $_, $_, $filename ) = File::Spec->splitpath($path);
 421     return $filename;
 422 }
 423
 424 sub roundToInt($) {
 425     my ($num) = @_;
 426     my $res = int( $num + .5 * ( $num <=> 0 ) );
 427
 428     return $res;
 429
 430 }
 431
 432 1;