2 # BioPerl module for Bio::Tools::ESTScan
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Hilmar Lapp <hlapp@gmx.net>
8 # Copyright Hilmar Lapp
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::ESTScan - Results of one ESTScan run
20 $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan');
22 $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT );
25 # note: this class is-a Bio::Tools::AnalysisResult which implements
26 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
27 while($gene = $estscan->next_prediction()) {
28 # $gene is an instance of Bio::Tools::Prediction::Gene
29 foreach my $orf ($gene->exons()) {
30 # $orf is an instance of Bio::Tools::Prediction::Exon
31 $cds_str = $orf->predicted_cds();
35 # essential if you gave a filename at initialization (otherwise the file
41 The ESTScan module provides a parser for ESTScan coding region prediction
44 This module inherits off L<Bio::Tools::AnalysisResult> and therefore
45 implements the L<Bio::SeqAnalysisParserI> interface.
46 See L<Bio::SeqAnalysisParserI>.
52 User feedback is an integral part of the evolution of this and other
53 Bioperl modules. Send your comments and suggestions preferably to one
54 of the Bioperl mailing lists. Your participation is much appreciated.
56 bioperl-l@bioperl.org - General discussion
57 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
61 Please direct usage questions or support issues to the mailing list:
63 I<bioperl-l@bioperl.org>
65 rather than to the module maintainer directly. Many experienced and
66 reponsive experts will be able look at the problem and quickly
67 address it. Please include a thorough description of the problem
68 with code and data examples if at all possible.
72 Report bugs to the Bioperl bug tracking system to help us keep track
73 the bugs and their resolution. Bug reports can be submitted via the
76 https://github.com/bioperl/bioperl-live/issues
78 =head1 AUTHOR - Hilmar Lapp
80 Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
84 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
88 # Let the code begin...
90 package Bio
::Tools
::ESTScan
;
96 use Bio
::Tools
::Prediction
::Exon
;
98 use base
qw(Bio::Tools::AnalysisResult);
100 sub _initialize_state
{
101 my ($self,@args) = @_;
103 # first call the inherited method!
104 my $make = $self->SUPER::_initialize_state
(@args);
106 if(! $self->analysis_method()) {
107 $self->analysis_method('ESTScan');
111 =head2 analysis_method
113 Usage : $estscan->analysis_method();
114 Purpose : Inherited method. Overridden to ensure that the name matches
122 sub analysis_method
{
124 my ($self, $method) = @_;
125 if($method && ($method !~ /estscan/i)) {
126 $self->throw("method $method not supported in " . ref($self));
128 return $self->SUPER::analysis_method
($method);
134 Usage : while($orf = $estscan->next_feature()) {
137 Function: Returns the next gene structure prediction of the ESTScan result
138 file. Call this method repeatedly until FALSE is returned.
140 The returned object is actually a SeqFeatureI implementing object.
141 This method is required for classes implementing the
142 SeqAnalysisParserI interface, and is merely an alias for
143 next_prediction() at present.
146 Returns : A Bio::Tools::Prediction::Gene object.
152 my ($self,@args) = @_;
153 # even though next_prediction doesn't expect any args (and this method
154 # does neither), we pass on args in order to be prepared if this changes
156 return $self->next_prediction(@args);
159 =head2 next_prediction
161 Title : next_prediction
162 Usage : while($gene = $estscan->next_prediction()) {
165 Function: Returns the next gene structure prediction of the ESTScan result
166 file. Call this method repeatedly until FALSE is returned.
168 So far, this method DOES NOT work for reverse strand predictions,
169 even though the code looks like.
171 Returns : A Bio::Tools::Prediction::Gene object.
176 sub next_prediction
{
178 my ($gene, $seq, $cds, $predobj);
181 # predictions are in the format of FASTA sequences and can be parsed one
183 $seq = $self->_fasta_stream()->next_seq();
185 # there is a new prediction
186 $gene = Bio
::Tools
::Prediction
::Gene
->new('-primary' => "ORFprediction",
187 '-source' => "ESTScan");
188 # score starts the description
189 $seq->desc() =~ /^([\d.]+)\s*(.*)/ or
190 $self->throw("unexpected format of description: no score in " .
194 # strand may end the description
195 if($seq->desc() =~ /(.*)minus strand$/) {
203 # check for the format: default or 'all-in-one' (option -a)
204 if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) {
207 $predobj = Bio
::Tools
::Prediction
::Exon
->new('-source' => "ESTScan",
210 $predobj->strand($gene->strand());
211 $predobj->score($gene->score()); # FIXME or $1, or $2 ?
212 $predobj->primary_tag("InternalExon");
213 $predobj->seq_id($seq->display_id());
214 # add to gene structure object
215 $gene->add_exon($predobj);
218 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
219 $cds = Bio
::PrimarySeq
->new('-seq' => $cds,
220 '-display_id' => $seq->display_id(),
221 '-desc' => $seq->desc(),
222 '-alphabet' => "dna");
223 $gene->predicted_cds($cds);
224 $predobj->predicted_cds($cds);
225 if($gene->strand() == -1) {
226 $self->warn("reverse strand ORF, but unable to reverse coordinates!");
230 # All-in-one format (hopefully). This encodes the following information
232 # 1) untranslated regions: stretches of lower-case letters
233 # 2) translated regions: stretches of upper-case letters
234 # 3) insertions in the translated regions: capital X
235 # 4) deletions in the translated regions: a single lower-case letter
237 # if reverse strand ORF, save a lot of hassle by reversing the sequence
238 if($gene->strand() == -1) {
239 $seq = $seq->revcom();
241 my $seqstr = $seq->seq();
242 while($seqstr =~ /^([a-z]*)([A-Z].*)$/) {
247 # strip 3'UTR and following exons
248 if($exonseq =~ s/([a-z]{2,}.*)$//) {
253 # start: take care of yielding the absolute coordinate
254 my $start = CORE
::length($utr5) + 1;
256 $start += $predobj->end() + $numins;
258 # for the end coordinate, we need to subtract the insertions
261 my $end = $start + CORE
::length($cds) - 1;
262 # construct next exon object
263 $predobj = Bio
::Tools
::Prediction
::Exon
->new('-start' => $start,
265 $predobj->source_tag("ESTScan");
266 $predobj->primary_tag("InternalExon");
267 $predobj->seq_id($seq->display_id());
268 $predobj->strand($gene->strand());
269 $predobj->score($gene->score());
270 # add the exon to the gene structure object
271 $gene->add_exon($predobj);
272 # add the predicted CDS
274 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
275 $cds = Bio
::PrimarySeq
->new('-seq' => $cds,
276 '-display_id' => $seq->display_id(),
277 '-desc' => $seq->desc(),
278 '-alphabet' => "dna");
279 # only store the first one in the overall prediction
280 $gene->predicted_cds($cds) unless $gene->predicted_cds();
281 $predobj->predicted_cds($cds);
282 # add the predicted insertions and deletions as subfeatures
285 while($exonseq =~ /([a-zX])/g) {
287 # start and end: start looking at the position after the
290 $start = $fea->start()+$numins;
291 $start -= 1 if($fea->primary_tag() eq 'insertion');
293 $start = $predobj->start()+$numins-1;
295 #print "# numins = $numins, indel = $indel, start = $start\n";
296 $start = index($seq->seq(), $indel, $start) + 1 - $numins;
297 $fea = Bio
::SeqFeature
::Generic
->new('-start' => $start,
299 $fea->source_tag("ESTScan");
300 $fea->seq_id($seq->display_id());
301 $fea->strand($predobj->strand());
303 # an insertion (depends on viewpoint: to get the 'real'
304 # CDS, a base has to be inserted, i.e., the HMMER model
305 # inserted a base; however, the sequencing process deleted
306 # a base that was there).
307 $fea->primary_tag("insertion");
308 # we need to count insertions because these are left out
309 # of any coordinates saved in the objects (which is correct
310 # because insertions change the original sequence, so
311 # coordinates wouldn't match)
314 # a deletion (depends on viewpoint: to get the 'real'
315 # CDS, a base has to be deleted, i.e., the HMMER model
316 # deleted a base; however, the sequencing process inserted
317 # a base that wasn't there).
318 $fea->primary_tag("deletion");
319 $fea->add_tag_value('base', $indel);
321 $predobj->add_sub_SeqFeature($fea);
332 Usage : $result->close()
333 Function: Closes the file handle associated with this result file.
334 Inherited method, overridden.
342 my ($self, @args) = @_;
344 delete($self->{'_fastastream'});
345 $self->SUPER::close(@args);
350 Title : _fasta_stream
351 Usage : $result->_fasta_stream()
352 Function: Gets/Sets the FASTA sequence IO stream for reading the contents of
353 the file associated with this MZEF result object.
355 If called for the first time, creates the stream from the filehandle
364 my ($self, $stream) = @_;
366 if($stream || (! exists($self->{'_fastastream'}))) {
368 $stream = Bio
::SeqIO
->new('-fh' => $self->_fh(),
369 '-format' => "fasta");
371 $self->{'_fastastream'} = $stream;
373 return $self->{'_fastastream'};