1 package SGN
::IntronFinder
::Homology
;
22 sub find_introns_txt
{
23 my ( $in_fh, $out_fh, $max_evalue, $gene_feature_file, $tempfile_dir,
27 # validate our input sequences and copy them to a temp file
28 my $seq_in = Bio
::SeqIO
->new( -fh
=> $in_fh, -format
=> 'fasta' );
29 die "Invalid input FASTA sequence " unless $seq_in;
31 my $temp_seq = File
::Temp
->new(
33 TEMPLATE
=> $FindBin::Script
. '-seq_in-XXXXXX',
37 my $seq_out_temp = Bio
::SeqIO
->new(
38 -file
=> '>' . $temp_seq->filename,
42 while ( my $seq = $seq_in->next_seq ) {
44 $seq_out_temp->write_seq($seq);
46 $seq_in = $seq_out_temp = undef; #< undef the seqIOs to close and
48 $seq_count or confess
"must provide at least one input sequence\n";
50 # run blastall, feeding sequences on its STDIN and STDOUT
51 my $blast_out = File
::Temp
->new(
53 TEMPLATE
=> $FindBin::Script
. '-blast_report-XXXXXX',
56 $blast_out->close; #< can't use the fh
58 glob $protein_db_base.'.*'
59 or confess
"BLAST db $protein_db_base.* not present on disk";
62 "blastall -d '$protein_db_base' -p 'blastx' -e $max_evalue < $temp_seq 2>&1 > $blast_out";
65 die "$! (\$?: $?) running $cmd: "
68 if( open my $f, $blast_out ) {
76 # go through the blast report
77 my $blast_in = Bio
::SearchIO
->new(
79 -file
=> $blast_out->filename,
82 # %unigenes maps the original identifier (EST) to an array (sequence
83 # of unigene, unigene id, EST_hqi start coord (on EST), EST length,
84 # EST sequence, EST start on unigene).
85 while ( my $result = $blast_in->next_result ) {
87 my $results_with_introns = 0;
89 my $query = $result->query_name;
91 $out_fh->print("Results for $query:\n\n");
93 while ( my $hit = $result->next_hit ) {
94 while ( my $hsp = $hit->next_hsp ) {
96 #check annotation of tair database for exon positions
97 #these are intron positions on protein, offset by where our hit starts.
98 #i.e. if the protein had an intron excised after the third amino acid, but our hit starts from the second amino acid, aa_intron_positions will contain 2.
99 #may contain fractions (introns that divide codons)
100 my @aa_intron_positions = checkIntrons
(
101 $gene_feature_file, $hit->name,
102 $hsp->start('hit'), $hsp->end('hit')
105 # print ("aa intron positions: \n");
106 # printArray(@aa_intron_positions);
108 #index in hit_string (aa) of real positions
109 my @aa_visual_positions =
110 map { convertRealToVisual
( $hsp->hit_string, $_ ); }
111 @aa_intron_positions;
113 # printArray(@aa_visual_positions);
114 #index in query_string(aa) of real positions
116 my @query_aa_real_positions =
117 map { convertVisualToReal
( $hsp->query_string, $_ ); }
118 @aa_visual_positions;
120 #my @query_aa_real_positions = @aa_visual_positions;
121 # printArray(@query_aa_real_positions);
123 my @cdna_intron_relative_positions =
124 map { $_ * 3 } @query_aa_real_positions;
126 # printArray(@cdna_intron_relative_positions);
128 my $reverse_orientation = 0;
131 ( $hsp->query->frame + 1 ) * $hsp->query->strand;
133 if ( $blast_frame < 0 ) {
134 $reverse_orientation = 1;
137 my @cdna_intron_absolute_positions;
138 if ($reverse_orientation) {
139 @cdna_intron_absolute_positions =
140 map { $hsp->end('query') - $_ - 1; }
141 @cdna_intron_relative_positions;
144 @cdna_intron_absolute_positions =
145 map { $_ + $hsp->start('query') - 1; }
146 @cdna_intron_relative_positions;
149 # make sure we don't get intron positions like 45.99999999999
150 @cdna_intron_absolute_positions =
151 map { roundToInt
($_) } @cdna_intron_absolute_positions;
153 if ( @aa_visual_positions
154 ) #make sure we have at least one intron in the alignment to print
156 $results_with_introns++; # results with introns count
157 if ($reverse_orientation) {
158 $out_fh->print("Frame for this query is negative.\n");
162 @aa_visual_positions . " introns found in region.\n" );
164 displayAlignments
( $out_fh, $query, $hsp, $hit,
165 $reverse_orientation );
167 displayIntrons
( $out_fh, \
@cdna_intron_absolute_positions,
168 \
@aa_visual_positions, $hsp->query_string,
170 $out_fh->print("\n");
177 "$results_with_introns results returned for query $query.\n\n");
181 ########## SUBROUTINES ################
183 # check if there is an intron in the given gene between start and end.
184 # start and end are amino acid #s, with first amino acid in protein = 1
185 # values will be offset from start, i.e. will return the number of aa.s
186 # expected between the value of start and a given intron, rather
187 # than absolute positions on the gene.
190 my ( $gene_feature_file, $geneid, $start, $end ) = @_;
192 my $features = _coding_region_index
($gene_feature_file);
193 my $frec = $features->{$geneid}
195 my ( $coords, $orientation ) = @
$frec;
196 my @coords = @
$coords;
198 @coords = sort { $a <=> $b } @coords;
200 # get rid of contiguous coding regions that don't have introns
202 @coords = deleteContiguousExons
(@coords);
204 # find # of amino acids coded by each coding region to do this, we
205 # have to play with the coordinates if the protein is reverse
208 if ( $orientation eq "reverse" ) {
209 my $start_on_chrom = $coords[$#coords];
213 map { ( $_ - $start_on_chrom ) * -1 } @coords;
215 my @aa_per_coding_region;
217 for ( my $j = 0 ; $j < @coords ; $j += 2 ) {
219 my $numaa = ( $coords[ $j + 1 ] - $coords[$j] + 1 ) / 3;
220 push @aa_per_coding_region, $numaa;
223 my @relevant_introns;
225 for ( my $j = 0, my $curTotal = 0 ; $j < @aa_per_coding_region ; $j++ ) {
226 $curTotal += $aa_per_coding_region[$j];
227 if ( $curTotal >= $end ) {
230 elsif ( $curTotal > $start ) {
231 push @relevant_introns, $curTotal - $start + 1;
233 # print("total is $curTotal, start is $start\n");
236 return @relevant_introns;
239 # first time checkIntrons called, cache our gene coding regions for this file in memory. could have done this as a memoized subroutine
240 memoize
('_coding_region_index');
242 sub _coding_region_index
{
243 my ($gene_feature_file) = @_;
245 $gene_feature_file or die "must pass a gene feature file!\n";
246 open my $datafile, '<', $gene_feature_file
247 or die "couldn't open data file $gene_feature_file";
250 while (<$datafile>) {
251 my ( undef, $geneid, undef, $type, $start, $stop, $length,
252 $orientation ) = split;
253 next unless $type eq 'coding_region';
254 push @
{ $coding_features{$geneid}[0] }, $start, $stop;
255 $coding_features{$geneid}[1] = $orientation;
258 return \
%coding_features;
261 #remove cases where we have coding region coordinates that do not actually have an intron between them, such as (0, 15, 15, 16) is reduced to (0, 16)
262 sub deleteContiguousExons
(@
) {
266 for ( my $i = 0 ; ( $i < @coords ) ; $i++ ) {
267 if ( not( $coords[ $i + 1 ] ) || ( $coords[$i] != $coords[ $i + 1 ] ) )
269 push @tempcoords, $coords[$i];
278 #check that the introns are within the est, rather than elsewhere on the unigene.
279 sub removeOutofboundsIntrons
{
280 my ( $pos, $start, $end ) = @_;
281 if ( $pos > $start && $pos < $end ) {
289 sub displayAlignments
{
291 my ( $out_fh, $query, $hsp, $hit, $reverse_orientation ) = @_;
295 $hit->name; #$opt_w ? makeArabidopsisURL( $hit->name ) : ( $hit->name );
297 my $query_start_pos =
300 : $hsp->start('query');
302 #we have no info in the database, just the sequence from the input file. since we blasted with the
303 #EST sequence anyway, no need for arithmetic, just print hsp and query.
305 my $wslen = 20 - length($query);
307 $out_fh->print( $query . ":" );
309 $out_fh->print( ' ' x
( $wslen - length($query_start_pos) ) );
311 $out_fh->print( $query_start_pos . "|" );
312 $out_fh->print( $hsp->query_string . "\n" );
314 #$eststart = $hsp->start('query');
317 $wslen = 20 - ( length( $hit->name ) + length( $hsp->start('hit') ) );
318 $out_fh->print( $hit_name . ":" );
319 $out_fh->print( ' ' x
$wslen );
320 $out_fh->print( $hsp->start('hit') . "|" );
321 $out_fh->print( $hsp->hit_string . "\n" );
324 return $reverse_orientation;
328 my ( $out_fh, $intron_absolute_positions, $intron_visual_positions,
329 $query_string, $hit_string, )
332 my @intron_visual_positions = @
$intron_visual_positions;
333 my @intron_absolute_positions = @
$intron_absolute_positions;
335 # map {print $_, " "} @intron_visual_positions;
337 my $intronstring = "possible introns: |";
338 my $positionstring = "positions: |";
340 my $current_visual_intron = shift @intron_visual_positions;
341 my $current_absolute_intron = shift @intron_absolute_positions;
344 $current_visual_intron && $current_absolute_intron ; $i++ )
346 if ( $i >= $current_visual_intron ) {
347 $intronstring .= "^";
348 $positionstring .= $current_absolute_intron;
350 # print "printing intron at $i\n";
351 # print "cvi is $current_visual_intron\n";
353 for ( my $j = 1 ; $j < length $current_absolute_intron ; $j++ ) {
354 $intronstring .= " ";
357 $current_visual_intron = shift @intron_visual_positions;
358 $current_absolute_intron = shift @intron_absolute_positions;
361 $intronstring .= " ";
362 $positionstring .= " ";
366 $out_fh->print( $intronstring . "\n" . $positionstring . "\n" );
369 sub convertRealToVisual
{
370 my ( $var1, $var2 ) = @_;
371 return convertRealVisual
( $var1, $var2, 1 );
374 sub convertVisualToReal
{
375 my ( $var1, $var2 ) = @_;
376 return convertRealVisual
( $var1, $var2, 0 );
379 # what position on the string does the dna/aa index correspond to, or vice versa, given gaps in the string.
380 # probably easier to do with regexps but i cant find the solution right now.
381 sub convertRealVisual
{
382 my ( $str, $pos, $r2v ) =
383 @_; #r2v is true if realtovisual, false if visualtoreal
387 my @strarray = split( //, $str );
389 while ( $visualpos < $pos ) {
390 $elem = $strarray[ $visualpos++ ];
391 if ( $elem eq '-' ) {
392 if ($r2v) { $pos += 1; }
401 map { print $_, " "; } @array;
405 sub makeArabidopsisURL
($) {
408 "http://www.arabidopsis.org/servlets/Search?type=general&name=GENEID&action=detail&method=4&sub_type=gene";
409 $url =~ s/GENEID/$hit/;
410 $url = "<a href=$url>$hit</a>";
414 # get file name from path
420 ( $_, $_, $filename ) = File
::Spec
->splitpath($path);
426 my $res = int( $num + .5 * ( $num <=> 0 ) );