Move HMMER related modules, tests, and programs to new distribution.
[bioperl-live.git] / Bio / Tools / ESTScan.pm
blob944f797ffaadd57039f33b7202250e1e71b1510e
2 # BioPerl module for Bio::Tools::ESTScan
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Hilmar Lapp <hlapp@gmx.net>
8 # Copyright Hilmar Lapp
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::ESTScan - Results of one ESTScan run
18 =head1 SYNOPSIS
20 $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan');
21 # filehandle:
22 $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT );
24 # parse the results
25 # note: this class is-a Bio::Tools::AnalysisResult which implements
26 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
27 while($gene = $estscan->next_prediction()) {
28 # $gene is an instance of Bio::Tools::Prediction::Gene
29 foreach my $orf ($gene->exons()) {
30 # $orf is an instance of Bio::Tools::Prediction::Exon
31 $cds_str = $orf->predicted_cds();
35 # essential if you gave a filename at initialization (otherwise the file
36 # will stay open)
37 $estscan->close();
39 =head1 DESCRIPTION
41 The ESTScan module provides a parser for ESTScan coding region prediction
42 output.
44 This module inherits off L<Bio::Tools::AnalysisResult> and therefore
45 implements the L<Bio::SeqAnalysisParserI> interface.
46 See L<Bio::SeqAnalysisParserI>.
48 =head1 FEEDBACK
50 =head2 Mailing Lists
52 User feedback is an integral part of the evolution of this and other
53 Bioperl modules. Send your comments and suggestions preferably to one
54 of the Bioperl mailing lists. Your participation is much appreciated.
56 bioperl-l@bioperl.org - General discussion
57 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
59 =head2 Support
61 Please direct usage questions or support issues to the mailing list:
63 I<bioperl-l@bioperl.org>
65 rather than to the module maintainer directly. Many experienced and
66 reponsive experts will be able look at the problem and quickly
67 address it. Please include a thorough description of the problem
68 with code and data examples if at all possible.
70 =head2 Reporting Bugs
72 Report bugs to the Bioperl bug tracking system to help us keep track
73 the bugs and their resolution. Bug reports can be submitted via the
74 web:
76 https://github.com/bioperl/bioperl-live/issues
78 =head1 AUTHOR - Hilmar Lapp
80 Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
82 =head1 APPENDIX
84 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
86 =cut
88 # Let the code begin...
90 package Bio::Tools::ESTScan;
91 use strict;
92 use Symbol;
94 use Bio::Root::Root;
95 use Bio::Tools::Prediction::Exon;
97 use base qw(Bio::Tools::AnalysisResult);
99 sub _initialize_state {
100 my ($self,@args) = @_;
102 # first call the inherited method!
103 my $make = $self->SUPER::_initialize_state(@args);
105 if(! $self->analysis_method()) {
106 $self->analysis_method('ESTScan');
110 =head2 analysis_method
112 Usage : $estscan->analysis_method();
113 Purpose : Inherited method. Overridden to ensure that the name matches
114 /estscan/i.
115 Returns : String
116 Argument : n/a
118 =cut
120 #-------------
121 sub analysis_method {
122 #-------------
123 my ($self, $method) = @_;
124 if($method && ($method !~ /estscan/i)) {
125 $self->throw("method $method not supported in " . ref($self));
127 return $self->SUPER::analysis_method($method);
130 =head2 next_feature
132 Title : next_feature
133 Usage : while($orf = $estscan->next_feature()) {
134 # do something
136 Function: Returns the next gene structure prediction of the ESTScan result
137 file. Call this method repeatedly until FALSE is returned.
139 The returned object is actually a SeqFeatureI implementing object.
140 This method is required for classes implementing the
141 SeqAnalysisParserI interface, and is merely an alias for
142 next_prediction() at present.
144 Example :
145 Returns : A Bio::Tools::Prediction::Gene object.
146 Args :
148 =cut
150 sub next_feature {
151 my ($self,@args) = @_;
152 # even though next_prediction doesn't expect any args (and this method
153 # does neither), we pass on args in order to be prepared if this changes
154 # ever
155 return $self->next_prediction(@args);
158 =head2 next_prediction
160 Title : next_prediction
161 Usage : while($gene = $estscan->next_prediction()) {
162 # do something
164 Function: Returns the next gene structure prediction of the ESTScan result
165 file. Call this method repeatedly until FALSE is returned.
167 So far, this method DOES NOT work for reverse strand predictions,
168 even though the code looks like.
169 Example :
170 Returns : A Bio::Tools::Prediction::Gene object.
171 Args :
173 =cut
175 sub next_prediction {
176 my ($self) = @_;
177 my ($gene, $seq, $cds, $predobj);
178 my $numins = 0;
180 # predictions are in the format of FASTA sequences and can be parsed one
181 # at a time
182 $seq = $self->_fasta_stream()->next_seq();
183 return unless $seq;
184 # there is a new prediction
185 $gene = Bio::Tools::Prediction::Gene->new('-primary' => "ORFprediction",
186 '-source' => "ESTScan");
187 # score starts the description
188 $seq->desc() =~ /^([\d.]+)\s*(.*)/ or
189 $self->throw("unexpected format of description: no score in " .
190 $seq->desc());
191 $gene->score($1);
192 $seq->desc($2);
193 # strand may end the description
194 if($seq->desc() =~ /(.*)minus strand$/) {
195 my $desc = $1;
196 $desc =~ s/;\s+$//;
197 $seq->desc($desc);
198 $gene->strand(-1);
199 } else {
200 $gene->strand(1);
202 # check for the format: default or 'all-in-one' (option -a)
203 if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) {
204 # default format
205 $seq->desc($5);
206 $predobj = Bio::Tools::Prediction::Exon->new('-source' => "ESTScan",
207 '-start' => $3,
208 '-end' => $4);
209 $predobj->strand($gene->strand());
210 $predobj->score($gene->score()); # FIXME or $1, or $2 ?
211 $predobj->primary_tag("InternalExon");
212 $predobj->seq_id($seq->display_id());
213 # add to gene structure object
214 $gene->add_exon($predobj);
215 # add predicted CDS
216 $cds = $seq->seq();
217 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
218 $cds = Bio::PrimarySeq->new('-seq' => $cds,
219 '-display_id' => $seq->display_id(),
220 '-desc' => $seq->desc(),
221 '-alphabet' => "dna");
222 $gene->predicted_cds($cds);
223 $predobj->predicted_cds($cds);
224 if($gene->strand() == -1) {
225 $self->warn("reverse strand ORF, but unable to reverse coordinates!");
227 } else {
229 # All-in-one format (hopefully). This encodes the following information
230 # into the sequence:
231 # 1) untranslated regions: stretches of lower-case letters
232 # 2) translated regions: stretches of upper-case letters
233 # 3) insertions in the translated regions: capital X
234 # 4) deletions in the translated regions: a single lower-case letter
236 # if reverse strand ORF, save a lot of hassle by reversing the sequence
237 if($gene->strand() == -1) {
238 $seq = $seq->revcom();
240 my $seqstr = $seq->seq();
241 while($seqstr =~ /^([a-z]*)([A-Z].*)$/) {
242 # leading 5'UTR
243 my $utr5 = $1;
244 # exon + 3'UTR
245 my $exonseq = $2;
246 # strip 3'UTR and following exons
247 if($exonseq =~ s/([a-z]{2,}.*)$//) {
248 $seqstr = $1;
249 } else {
250 $seqstr = "";
252 # start: take care of yielding the absolute coordinate
253 my $start = CORE::length($utr5) + 1;
254 if($predobj) {
255 $start += $predobj->end() + $numins;
257 # for the end coordinate, we need to subtract the insertions
258 $cds = $exonseq;
259 $cds =~ s/[X]//g;
260 my $end = $start + CORE::length($cds) - 1;
261 # construct next exon object
262 $predobj = Bio::Tools::Prediction::Exon->new('-start' => $start,
263 '-end' => $end);
264 $predobj->source_tag("ESTScan");
265 $predobj->primary_tag("InternalExon");
266 $predobj->seq_id($seq->display_id());
267 $predobj->strand($gene->strand());
268 $predobj->score($gene->score());
269 # add the exon to the gene structure object
270 $gene->add_exon($predobj);
271 # add the predicted CDS
272 $cds = $exonseq;
273 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
274 $cds = Bio::PrimarySeq->new('-seq' => $cds,
275 '-display_id' => $seq->display_id(),
276 '-desc' => $seq->desc(),
277 '-alphabet' => "dna");
278 # only store the first one in the overall prediction
279 $gene->predicted_cds($cds) unless $gene->predicted_cds();
280 $predobj->predicted_cds($cds);
281 # add the predicted insertions and deletions as subfeatures
282 # of the exon
283 my $fea = undef;
284 while($exonseq =~ /([a-zX])/g) {
285 my $indel = $1;
286 # start and end: start looking at the position after the
287 # previous feature
288 if($fea) {
289 $start = $fea->start()+$numins;
290 $start -= 1 if($fea->primary_tag() eq 'insertion');
291 } else {
292 $start = $predobj->start()+$numins-1;
294 #print "# numins = $numins, indel = $indel, start = $start\n";
295 $start = index($seq->seq(), $indel, $start) + 1 - $numins;
296 $fea = Bio::SeqFeature::Generic->new('-start' => $start,
297 '-end' => $start);
298 $fea->source_tag("ESTScan");
299 $fea->seq_id($seq->display_id());
300 $fea->strand($predobj->strand());
301 if($indel eq 'X') {
302 # an insertion (depends on viewpoint: to get the 'real'
303 # CDS, a base has to be inserted, i.e., the HMMER model
304 # inserted a base; however, the sequencing process deleted
305 # a base that was there).
306 $fea->primary_tag("insertion");
307 # we need to count insertions because these are left out
308 # of any coordinates saved in the objects (which is correct
309 # because insertions change the original sequence, so
310 # coordinates wouldn't match)
311 $numins++;
312 } else {
313 # a deletion (depends on viewpoint: to get the 'real'
314 # CDS, a base has to be deleted, i.e., the HMMER model
315 # deleted a base; however, the sequencing process inserted
316 # a base that wasn't there).
317 $fea->primary_tag("deletion");
318 $fea->add_tag_value('base', $indel);
320 $predobj->add_sub_SeqFeature($fea);
325 return $gene;
328 =head2 close
330 Title : close
331 Usage : $result->close()
332 Function: Closes the file handle associated with this result file.
333 Inherited method, overridden.
334 Example :
335 Returns :
336 Args :
338 =cut
340 sub close {
341 my ($self, @args) = @_;
343 delete($self->{'_fastastream'});
344 $self->SUPER::close(@args);
347 =head2 _fasta_stream
349 Title : _fasta_stream
350 Usage : $result->_fasta_stream()
351 Function: Gets/Sets the FASTA sequence IO stream for reading the contents of
352 the file associated with this MZEF result object.
354 If called for the first time, creates the stream from the filehandle
355 if necessary.
356 Example :
357 Returns :
358 Args :
360 =cut
362 sub _fasta_stream {
363 my ($self, $stream) = @_;
365 if($stream || (! exists($self->{'_fastastream'}))) {
366 if(! $stream) {
367 $stream = Bio::SeqIO->new('-fh' => $self->_fh(),
368 '-format' => "fasta");
370 $self->{'_fastastream'} = $stream;
372 return $self->{'_fastastream'};