2 # BioPerl module for Bio::Tools::ESTScan
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Hilmar Lapp <hlapp@gmx.net>
8 # Copyright Hilmar Lapp
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::ESTScan - Results of one ESTScan run
20 $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan');
22 $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT );
25 # note: this class is-a Bio::Tools::AnalysisResult which implements
26 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
27 while($gene = $estscan->next_prediction()) {
28 # $gene is an instance of Bio::Tools::Prediction::Gene
29 foreach my $orf ($gene->exons()) {
30 # $orf is an instance of Bio::Tools::Prediction::Exon
31 $cds_str = $orf->predicted_cds();
35 # essential if you gave a filename at initialization (otherwise the file
41 The ESTScan module provides a parser for ESTScan coding region prediction
44 This module inherits off L<Bio::Tools::AnalysisResult> and therefore
45 implements the L<Bio::SeqAnalysisParserI> interface.
46 See L<Bio::SeqAnalysisParserI>.
52 User feedback is an integral part of the evolution of this and other
53 Bioperl modules. Send your comments and suggestions preferably to one
54 of the Bioperl mailing lists. Your participation is much appreciated.
56 bioperl-l@bioperl.org - General discussion
57 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
61 Please direct usage questions or support issues to the mailing list:
63 I<bioperl-l@bioperl.org>
65 rather than to the module maintainer directly. Many experienced and
66 reponsive experts will be able look at the problem and quickly
67 address it. Please include a thorough description of the problem
68 with code and data examples if at all possible.
72 Report bugs to the Bioperl bug tracking system to help us keep track
73 the bugs and their resolution. Bug reports can be submitted via the
76 https://github.com/bioperl/bioperl-live/issues
78 =head1 AUTHOR - Hilmar Lapp
80 Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
84 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
88 # Let the code begin...
90 package Bio
::Tools
::ESTScan
;
95 use Bio
::Tools
::Prediction
::Exon
;
97 use base
qw(Bio::Tools::AnalysisResult);
99 sub _initialize_state
{
100 my ($self,@args) = @_;
102 # first call the inherited method!
103 my $make = $self->SUPER::_initialize_state
(@args);
105 if(! $self->analysis_method()) {
106 $self->analysis_method('ESTScan');
110 =head2 analysis_method
112 Usage : $estscan->analysis_method();
113 Purpose : Inherited method. Overridden to ensure that the name matches
121 sub analysis_method
{
123 my ($self, $method) = @_;
124 if($method && ($method !~ /estscan/i)) {
125 $self->throw("method $method not supported in " . ref($self));
127 return $self->SUPER::analysis_method
($method);
133 Usage : while($orf = $estscan->next_feature()) {
136 Function: Returns the next gene structure prediction of the ESTScan result
137 file. Call this method repeatedly until FALSE is returned.
139 The returned object is actually a SeqFeatureI implementing object.
140 This method is required for classes implementing the
141 SeqAnalysisParserI interface, and is merely an alias for
142 next_prediction() at present.
145 Returns : A Bio::Tools::Prediction::Gene object.
151 my ($self,@args) = @_;
152 # even though next_prediction doesn't expect any args (and this method
153 # does neither), we pass on args in order to be prepared if this changes
155 return $self->next_prediction(@args);
158 =head2 next_prediction
160 Title : next_prediction
161 Usage : while($gene = $estscan->next_prediction()) {
164 Function: Returns the next gene structure prediction of the ESTScan result
165 file. Call this method repeatedly until FALSE is returned.
167 So far, this method DOES NOT work for reverse strand predictions,
168 even though the code looks like.
170 Returns : A Bio::Tools::Prediction::Gene object.
175 sub next_prediction
{
177 my ($gene, $seq, $cds, $predobj);
180 # predictions are in the format of FASTA sequences and can be parsed one
182 $seq = $self->_fasta_stream()->next_seq();
184 # there is a new prediction
185 $gene = Bio
::Tools
::Prediction
::Gene
->new('-primary' => "ORFprediction",
186 '-source' => "ESTScan");
187 # score starts the description
188 $seq->desc() =~ /^([\d.]+)\s*(.*)/ or
189 $self->throw("unexpected format of description: no score in " .
193 # strand may end the description
194 if($seq->desc() =~ /(.*)minus strand$/) {
202 # check for the format: default or 'all-in-one' (option -a)
203 if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) {
206 $predobj = Bio
::Tools
::Prediction
::Exon
->new('-source' => "ESTScan",
209 $predobj->strand($gene->strand());
210 $predobj->score($gene->score()); # FIXME or $1, or $2 ?
211 $predobj->primary_tag("InternalExon");
212 $predobj->seq_id($seq->display_id());
213 # add to gene structure object
214 $gene->add_exon($predobj);
217 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
218 $cds = Bio
::PrimarySeq
->new('-seq' => $cds,
219 '-display_id' => $seq->display_id(),
220 '-desc' => $seq->desc(),
221 '-alphabet' => "dna");
222 $gene->predicted_cds($cds);
223 $predobj->predicted_cds($cds);
224 if($gene->strand() == -1) {
225 $self->warn("reverse strand ORF, but unable to reverse coordinates!");
229 # All-in-one format (hopefully). This encodes the following information
231 # 1) untranslated regions: stretches of lower-case letters
232 # 2) translated regions: stretches of upper-case letters
233 # 3) insertions in the translated regions: capital X
234 # 4) deletions in the translated regions: a single lower-case letter
236 # if reverse strand ORF, save a lot of hassle by reversing the sequence
237 if($gene->strand() == -1) {
238 $seq = $seq->revcom();
240 my $seqstr = $seq->seq();
241 while($seqstr =~ /^([a-z]*)([A-Z].*)$/) {
246 # strip 3'UTR and following exons
247 if($exonseq =~ s/([a-z]{2,}.*)$//) {
252 # start: take care of yielding the absolute coordinate
253 my $start = CORE
::length($utr5) + 1;
255 $start += $predobj->end() + $numins;
257 # for the end coordinate, we need to subtract the insertions
260 my $end = $start + CORE
::length($cds) - 1;
261 # construct next exon object
262 $predobj = Bio
::Tools
::Prediction
::Exon
->new('-start' => $start,
264 $predobj->source_tag("ESTScan");
265 $predobj->primary_tag("InternalExon");
266 $predobj->seq_id($seq->display_id());
267 $predobj->strand($gene->strand());
268 $predobj->score($gene->score());
269 # add the exon to the gene structure object
270 $gene->add_exon($predobj);
271 # add the predicted CDS
273 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
274 $cds = Bio
::PrimarySeq
->new('-seq' => $cds,
275 '-display_id' => $seq->display_id(),
276 '-desc' => $seq->desc(),
277 '-alphabet' => "dna");
278 # only store the first one in the overall prediction
279 $gene->predicted_cds($cds) unless $gene->predicted_cds();
280 $predobj->predicted_cds($cds);
281 # add the predicted insertions and deletions as subfeatures
284 while($exonseq =~ /([a-zX])/g) {
286 # start and end: start looking at the position after the
289 $start = $fea->start()+$numins;
290 $start -= 1 if($fea->primary_tag() eq 'insertion');
292 $start = $predobj->start()+$numins-1;
294 #print "# numins = $numins, indel = $indel, start = $start\n";
295 $start = index($seq->seq(), $indel, $start) + 1 - $numins;
296 $fea = Bio
::SeqFeature
::Generic
->new('-start' => $start,
298 $fea->source_tag("ESTScan");
299 $fea->seq_id($seq->display_id());
300 $fea->strand($predobj->strand());
302 # an insertion (depends on viewpoint: to get the 'real'
303 # CDS, a base has to be inserted, i.e., the HMMER model
304 # inserted a base; however, the sequencing process deleted
305 # a base that was there).
306 $fea->primary_tag("insertion");
307 # we need to count insertions because these are left out
308 # of any coordinates saved in the objects (which is correct
309 # because insertions change the original sequence, so
310 # coordinates wouldn't match)
313 # a deletion (depends on viewpoint: to get the 'real'
314 # CDS, a base has to be deleted, i.e., the HMMER model
315 # deleted a base; however, the sequencing process inserted
316 # a base that wasn't there).
317 $fea->primary_tag("deletion");
318 $fea->add_tag_value('base', $indel);
320 $predobj->add_sub_SeqFeature($fea);
331 Usage : $result->close()
332 Function: Closes the file handle associated with this result file.
333 Inherited method, overridden.
341 my ($self, @args) = @_;
343 delete($self->{'_fastastream'});
344 $self->SUPER::close(@args);
349 Title : _fasta_stream
350 Usage : $result->_fasta_stream()
351 Function: Gets/Sets the FASTA sequence IO stream for reading the contents of
352 the file associated with this MZEF result object.
354 If called for the first time, creates the stream from the filehandle
363 my ($self, $stream) = @_;
365 if($stream || (! exists($self->{'_fastastream'}))) {
367 $stream = Bio
::SeqIO
->new('-fh' => $self->_fh(),
368 '-format' => "fasta");
370 $self->{'_fastastream'} = $stream;
372 return $self->{'_fastastream'};