Bio/PhyloNetwork*: move to another repo with same name.
[bioperl-live.git] / Bio / Tools / Spidey / Results.pm
blob84dcc3f6d3911ce2d0e7b7c40b8ab95cac8e577a
2 # BioPerl module for Bio::Tools::Spidey::Results
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ryan Golhar <golharam@umdnj.edu>
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
12 =head1 NAME
14 Bio::Tools::Spidey::Results - Results of a Spidey run
16 =head1 SYNOPSIS
18 use Bio::Tools::Spidey::Results;
19 my $spidey = Bio::Tools::Spidey::Results->new(-file => 'result.spidey' );
21 # or
23 my $spidey = Bio::Tools::Spidey::Results->new( -fh => \*INPUT );
25 # get the exons before doing anything else
26 my $exonset = $spidey->next_exonset();
28 # parse the results
29 my @exons = $exonset->sub_SeqFeature();
30 print "Total no of Exons: ", scalar(@exons), "\n";
32 print "Genomic sequence length: ", $spidey->genomic_dna_length(), "\n";
34 # $exonset is-a Bio::SeqFeature::Generic with Bio::Tools::Spidey::Exons
35 # as sub features
36 print "Delimited on sequence ", $exonset->seq_id(), " from ",
37 $exonset->start(), " to ", $exonset->end(), "\n";
39 foreach my $exon ( $exonset->sub_SeqFeature() ) {
40 # $exon is-a Bio::SeqFeature::FeaturePair
41 print "Exon from ", $exon->start, " to ", $exon->end,
42 " on strand ", $exon->strand(), "\n";
43 # you can get out what it matched using the est_hit attribute
44 my $homol = $exon->est_hit();
45 print "Matched to sequence ", $homol->seq_id,
46 " at ", $homol->start," to ", $homol->end, "\n";
49 # essential if you gave a filename at initialization (otherwise
50 # the file stays open)
51 $spidey->close();
53 =head1 DESCRIPTION
55 The spidey module provides a parser and results object for spidey
56 output. The spidey results are specialised types of SeqFeatures,
57 meaning you can add them to AnnSeq objects fine, and manipulate them
58 in the "normal" seqfeature manner.
60 The spidey Exon objects are Bio::SeqFeature::FeaturePair inherited
61 objects. The $esthit = $exon-E<gt>est_hit() is the alignment as a
62 feature on the matching object (normally, a cDNA), in which the
63 start/end points are where the hit lies.
65 To make this module work sensibly you need to run
67 spidey -i genomic.fasta -m cDNA.fasta
69 =head1 FEEDBACK
71 =head2 Mailing Lists
73 User feedback is an integral part of the evolution of this and other
74 Bioperl modules. Send your comments and suggestions preferably to one
75 of the Bioperl mailing lists. Your participation is much appreciated.
77 bioperl-l@bioperl.org - General discussion
78 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
80 =head2 Support
82 Please direct usage questions or support issues to the mailing list:
84 I<bioperl-l@bioperl.org>
86 rather than to the module maintainer directly. Many experienced and
87 reponsive experts will be able look at the problem and quickly
88 address it. Please include a thorough description of the problem
89 with code and data examples if at all possible.
91 =head2 Reporting Bugs
93 Report bugs to the Bioperl bug tracking system to help us keep track
94 the bugs and their resolution. Bug reports can be submitted via the
95 web:
97 https://github.com/bioperl/bioperl-live/issues
99 =head1 AUTHOR - Ryan Golhar
101 Email golharam@umdnj.edu
103 =head1 APPENDIX
105 The rest of the documentation details each of the object methods.
106 Internal methods are usually preceded with a _
108 =cut
111 # Let the code begin...
114 package Bio::Tools::Spidey::Results;
115 use strict;
117 use File::Basename;
118 use Bio::Root::Root;
119 use Bio::Tools::Spidey::Exon;
121 use base qw(Bio::Tools::AnalysisResult);
123 sub _initialize_state {
124 my($self,@args) = @_;
126 # call the inherited method first
127 my $make = $self->SUPER::_initialize_state(@args);
129 # my ($est_is_first) = $self->_rearrange([qw(ESTFIRST)], @args);
131 # delete($self->{'_est_is_first'});
132 # $self->{'_est_is_first'} = $est_is_first if(defined($est_is_first));
133 $self->analysis_method("Spidey");
136 =head2 analysis_method
138 Usage : $spidey->analysis_method();
139 Purpose : Inherited method. Overridden to ensure that the name matches
140 /Spidey/i.
141 Returns : String
142 Argument : n/a
144 =cut
146 #-------------
147 sub analysis_method {
148 #-------------
149 my ($self, $method) = @_;
150 if($method && ($method !~ /Spidey/i)) {
151 $self->throw("method $method not supported in " . ref($self));
153 return $self->SUPER::analysis_method($method);
156 =head2 parse_next_alignment
158 Title : parse_next_alignment
159 Usage : @exons = $spidey_result->parse_next_alignment;
160 foreach $exon (@exons) {
161 # do something
163 Function: Parses the next alignment of the Spidey result file and returns the
164 found exons as an array of Bio::Tools::Spidey::Exon objects. Call
165 this method repeatedly until an empty array is returned to get the
166 results for all alignments.
167 Example :
168 Returns : An array of Bio::Tools::Spidey::Exon objects
169 Args :
172 =cut
174 sub parse_next_alignment {
175 my ($self) = @_;
176 # for strand 1 = plus, -1 = minus
177 my ($started,$version,$strand, $exoncount) = (0,0,0,-1);
178 my (%seq1props,%seq2props,@exons);
180 # we refer to the properties of each seq by reference
182 while(defined($_ = $self->_readline())) {
183 chomp;
185 # bascially, parse a Spidey result...
187 # matches: --SPIDEY version 1.40--
188 if( /^--SPIDEY\s+version\s+(\d+\.\d+)--/) {
189 if($started) {
190 $self->_pushback($_);
191 return \@exons;
193 $version = $1;
194 if ($version != 1.40) {
195 $self->throw("Spidey parser only designed to work with Spidey v1.40\n");
197 $started = 1;
198 } elsif (/^Genomic:\s+(\S+)\s.*,\s+(\d+)\sbp$/ ) {
199 # matches: Genomic: lcl|some_name other information, 1234 bp
200 # $seq1props{'filename'} = $1;
201 $seq1props{'seqname'} = $1;
202 $seq1props{'length'} = $2;
203 $self->genomic_dna_length($seq1props{'length'});
205 } elsif( /^mRNA:\s+(\S+)\s.*,(?:\s+mRNA\s+sequence,)?\s(\d+)\sbp$/ ) {
206 # matches: mRNA:
207 # $seq2props{'filename'} = $1;
208 $seq2props{'seqname'} = $1;
209 $seq2props{'length'} = $2;
211 } elsif( /^Strand:/ ) {
212 if (/plus/) {
213 $strand = 1;
214 } else {
215 $strand = -1;
217 } elsif( /^Number of exons: (\d+)/ ) {
218 $exoncount = $1;
220 my ($genomic_start, $genomic_stop, $cdna_start, $cdna_stop,
221 $id, $mismatches, $gaps, $splice_donor,
222 $splice_acceptor, $uncertain);
224 # the next $exoncount lines contains information
225 # about the matches of each exon. we should parse
226 # this information here
228 for (my $ec = 1; $ec <= $exoncount; $ec++) {
229 if (defined($_ = $self->_readline())) {
230 chomp;
232 if (/^Exon\s$ec[\(\)-]*:\s(\d+)-(\d+)\s\(gen\)\s+(\d+)-(\d+)\s\(mRNA\)\s+id\s([\d\.inf-]+)%\s+mismatches\s(\d+)\s+gaps\s(\d+)\s+splice\ssite\s\(d\s+a\):\s(\d+)\s+(\d+)\s*(\w*)/) {
233 $genomic_start = $1;
234 $genomic_stop = $2;
235 $cdna_start = $3;
236 $cdna_stop = $4;
237 $id = $5;
238 $mismatches = $6;
239 $gaps = $7;
240 $splice_donor = $8;
241 $splice_acceptor = $9;
242 $uncertain = $10;
243 } else {
244 $self->throw( "Failed to match anything:\n$_\n");
247 my $exon = Bio::Tools::Spidey::Exon->new
248 (-start => $genomic_start,
249 -end => $genomic_stop,
250 -strand => $strand);
251 $exon->seq_id($seq1props{'seqname'});
253 # feature1 is supposed to be initialized to a Similarity object, but we provide a safety net
254 if ($exon->feature1->can('seqlength')) {
255 $exon->feature1->seqlength($seq1props{'length'});
256 } else {
257 $exon->feature1->add_tag_value('seqlength', $seq1props{'length'});
260 # create and initialize the feature wrapping the 'hit' (the cDNA)
261 my $fea2 = Bio::SeqFeature::Similarity->new
262 (-start => $cdna_start,
263 -end => $cdna_stop,
264 -strand => $strand,
265 -seq_id => $seq2props{'seqname'},
266 -primary => "aligning_cDNA");
267 $fea2->seqlength($seq2props{'length'});
268 # store
269 $exon->est_hit($fea2);
271 # general properties
272 $exon->source_tag($self->analysis_method());
273 $exon->percentage_id($5);
274 $exon->mismatches($6);
275 $exon->gaps($7);
276 $exon->donor($8);
277 $exon->acceptor($9);
279 # push onto array
280 push(@exons, $exon);
281 } else {
282 $self->throw("Unexpected end of file reached\n");
285 } elsif( /^Number of splice sites:\s+(\d+)/ ) {
286 $self->splicesites($1);
287 } elsif( /^mRNA coverage:\s+(\d+)%/ ) {
288 $self->est_coverage($1);
289 } elsif(/^overall percent identity:\s+([\d\.]+)%/ ) {
290 $self->overall_percentage_id($1);
291 } elsif(/^Missing mRNA ends:\s+(\w+)/ ) {
292 $self->missing_mrna_ends($1);
293 } elsif( /^Exon (\d+): (\d+)-(\d+) \(gen\)\s+(\d+)-(\d+) \(mRNA\)/ ) {
294 my ($exon_num, $gen_start, $gen_stop, $cdna_start, $cdna_stop);
295 $exon_num = $1;
296 $gen_start = $2;
297 $gen_stop = $3;
298 $cdna_start = $4;
299 $cdna_stop = $5;
300 } elsif( /No alignment found/ ) {
301 return [];
302 } else {
303 #$self->debug("unmatched $_\n");
306 # Typical format:
307 # Exon 1: 36375798-36375691 (gen) 1-108 (mRNA)
310 # CCTCTTTTTCTTTGCAGGGTATATACCCAGTTACTTAGACAAGGATGAGCTATGTGTAGT
311 # | ||||||||||||||||||||||||||||||||||||||||||||||
312 # ATGTCAGGGTATATACCCAGTTACTTAGACAAGGATGAGCTATGTGTAGT
313 # M S G Y I P S Y L D K D E L C V V
316 # ATGTGGGGACAAAGCCACCGGATATCATTATCGCTGCATCACTTGTGAAGGTTGCAAGGT
317 # ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
318 # ATGTGGGGACAAAGCCACCGGATATCATTATCGCTGCATCACTTGTGAAGGTTGCAAG
319 # C G D K A T G Y H Y R C I T C E G C K
322 # AAATGGCA
324 @exons ? return \@exons : return ;
327 =head2 next_exonset
329 Title : next_exonset
330 Usage : $exonset = $spidey_result->parse_next_exonset;
331 print "Exons start at ", $exonset->start(),
332 "and end at ", $exonset->end(), "\n";
333 for $exon ($exonset->sub_SeqFeature()) {
334 # do something
336 Function: Parses the next alignment of the Spidey result file and returns the
337 set of exons as a container of features. The container is itself
338 a Bio::SeqFeature::Generic object, with the Bio::Tools::Spidey::Exon
339 objects as sub features. Start, end, and strand of the container
340 will represent the total region covered by the exons of this set.
342 See the documentation of parse_next_alignment() for further
343 reference about parsing and how the information is stored.
344 Example :
345 Returns : An Bio::SeqFeature::Generic object holding Bio::Tools::Spidey::Exon
346 objects as sub features.
347 Args :
349 =cut
351 sub next_exonset {
352 my $self = shift;
353 my $exonset;
355 # get the next array of exons
356 my $exons = $self->parse_next_alignment();
357 if( ! defined $exons ) {
358 $self->warn("No exons returned");
359 return;
361 if( @$exons == 0 ) {
362 return Bio::SeqFeature::Generic->new();
364 # create the container of exons as a feature object itself, with the
365 # data of the first exon for initialization
366 $exonset = Bio::SeqFeature::Generic->new('-start' => $exons->[0]->start(),
367 '-end' => $exons->[-1]->end(),
368 '-strand' => $exons->[0]->strand(),
369 '-primary' => "ExonSet");
370 $exonset->source_tag($exons->[0]->source_tag());
371 $exonset->seq_id($exons->[0]->seq_id());
372 # now add all exons as sub features, with enabling EXPANsion of the region
373 # covered in total
374 foreach my $exon (@$exons) {
375 $exonset->add_sub_SeqFeature($exon, 'EXPAND');
377 return $exonset;
380 =head2 next_feature
382 Title : next_feature
383 Usage : while($exonset = $spidey->next_feature()) {
384 # do something
386 Function: Does the same as L<next_exonset()>. See there for documentation of
387 the functionality. Call this method repeatedly until FALSE is
388 returned.
390 The returned object is actually a SeqFeatureI implementing object.
391 This method is required for classes implementing the
392 SeqAnalysisParserI interface, and is merely an alias for
393 next_exonset() at present.
395 Example :
396 Returns : A Bio::SeqFeature::Generic object.
397 Args :
399 =cut
401 sub next_feature {
402 my ($self,@args) = @_;
403 # even though next_exonset doesn't expect any args (and this method
404 # does neither), we pass on args in order to be prepared if this changes
405 # ever
406 return $self->next_exonset(@args);
409 =head2 genomic_dna_length
411 Title : genomic_dna_length
412 Usage : $spidey->genomic_dna_length();
413 Function: Returns the length of the genomic DNA used in this Spidey result
414 Example :
415 Returns : An integer value.
416 Args :
418 =cut
420 sub genomic_dna_length {
421 my ($self, @args) = @_;
422 my $val;
424 if(@args) {
425 $val = shift(@args);
426 $self->{'genomic_dna_length'} = $val;
427 } else {
428 $val = $self->{'genomic_dna_length'};
430 return $val;
433 =head2 splicesites
435 Title : splicesites
436 Usage : $spidey->splicesites();
437 Function: Returns the number of splice sites found in this Spidey result
438 Example :
439 Returns : An integer value.
440 Args :
442 =cut
444 sub splicesites {
445 my ($self, @args) = @_;
446 my $val;
448 if(@args) {
449 $val = shift(@args);
450 $self->{'splicesites'} = $val;
451 } else {
452 $val = $self->{'splicesites'};
454 return $val;
457 =head2 est_coverage
459 Title : est_coverage
460 Usage : $spidey->est_coverage();
461 Function: Returns the percent of est coverage in this Spidey result
462 Example :
463 Returns : An integer value.
464 Args :
466 =cut
468 sub est_coverage {
469 my ($self, @args) = @_;
470 my $val;
472 if(@args) {
473 $val = shift(@args);
474 $self->{'est_coverage'} = $val;
475 } else {
476 $val = $self->{'est_coverage'};
478 return $val;
481 =head2 overall_percentage_id
483 Title : overall_percentage_id
484 Usage : $spidey->overall_percentage_id();
485 Function: Returns the overall percent id in this Spidey result
486 Example :
487 Returns : An float value.
488 Args :
490 =cut
492 sub overall_percentage_id {
493 my ($self, @args) = @_;
494 my $val;
496 if(@args) {
497 $val = shift(@args);
498 $self->{'overall_percentage_id'} = $val;
499 } else {
500 $val = $self->{'overall_percentage_id'};
502 return $val;
505 =head2 missing_mrna_ends
507 Title : missing_mrna_ends
508 Usage : $spidey->missing_mrna_ends();
509 Function: Returns left/right/neither from Spidey
510 Example :
511 Returns : A string value.
512 Args :
514 =cut
516 sub missing_mrna_ends {
517 my ($self, @args) = @_;
518 my $val;
520 if(@args) {
521 $val = shift(@args);
522 $self->{'missing_mrna_ends'} = $val;
523 } else {
524 $val = $self->{'missing_mrna_ends'};
526 return $val;