2 # BioPerl module for Bio::Tools::Genscan
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Hilmar Lapp <hlapp@gmx.net>
8 # Copyright Hilmar Lapp
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::Genscan - Results of one Genscan run
20 use Bio::Tools::Genscan;
22 $genscan = Bio::Tools::Genscan->new(-file => 'result.genscan');
24 $genscan = Bio::Tools::Genscan->new( -fh => \*INPUT );
27 # note: this class is-a Bio::Tools::AnalysisResult which implements
28 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
29 while($gene = $genscan->next_prediction()) {
30 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
31 # off Bio::SeqFeature::Gene::Transcript.
33 # $gene->exons() returns an array of
34 # Bio::Tools::Prediction::Exon objects
36 @exon_arr = $gene->exons();
39 @init_exons = $gene->exons('Initial');
41 @intrl_exons = $gene->exons('Internal');
43 @term_exons = $gene->exons('Terminal');
45 ($single_exon) = $gene->exons();
48 # essential if you gave a filename at initialization (otherwise the file
54 The Genscan module provides a parser for Genscan gene structure prediction
55 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript-
58 This module also implements the Bio::SeqAnalysisParserI interface, and thus
59 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>.
65 User feedback is an integral part of the evolution of this and other
66 Bioperl modules. Send your comments and suggestions preferably to one
67 of the Bioperl mailing lists. Your participation is much appreciated.
69 bioperl-l@bioperl.org - General discussion
70 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
74 Please direct usage questions or support issues to the mailing list:
76 I<bioperl-l@bioperl.org>
78 rather than to the module maintainer directly. Many experienced and
79 reponsive experts will be able look at the problem and quickly
80 address it. Please include a thorough description of the problem
81 with code and data examples if at all possible.
85 Report bugs to the Bioperl bug tracking system to help us keep track
86 the bugs and their resolution. Bug reports can be submitted via the
89 https://github.com/bioperl/bioperl-live/issues
91 =head1 AUTHOR - Hilmar Lapp
97 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
102 # Let the code begin...
105 package Bio
::Tools
::Genscan
;
110 use Bio
::Tools
::Prediction
::Gene
;
111 use Bio
::Tools
::Prediction
::Exon
;
113 use base
qw(Bio::Tools::AnalysisResult);
115 my %ExonTags = ('Init' => 'Initial',
116 'Intr' => 'Internal',
117 'Term' => 'Terminal',
120 sub _initialize_state
{
121 my ($self,@args) = @_;
123 # first call the inherited method!
124 $self->SUPER::_initialize_state
(@args);
126 # our private state variables
127 $self->{'_preds_parsed'} = 0;
128 $self->{'_has_cds'} = 0;
129 # array of pre-parsed predictions
130 $self->{'_preds'} = [];
132 $self->{'_seqstack'} = [];
135 =head2 analysis_method
137 Usage : $genscan->analysis_method();
138 Purpose : Inherited method. Overridden to ensure that the name matches
146 sub analysis_method
{
148 my ($self, $method) = @_;
149 if($method && ($method !~ /genscan/i)) {
150 $self->throw("method $method not supported in " . ref($self));
152 return $self->SUPER::analysis_method
($method);
158 Usage : while($gene = $genscan->next_feature()) {
161 Function: Returns the next gene structure prediction of the Genscan result
162 file. Call this method repeatedly until FALSE is returned.
164 The returned object is actually a SeqFeatureI implementing object.
165 This method is required for classes implementing the
166 SeqAnalysisParserI interface, and is merely an alias for
167 next_prediction() at present.
170 Returns : A Bio::Tools::Prediction::Gene object.
176 my ($self,@args) = @_;
177 # even though next_prediction doesn't expect any args (and this method
178 # does neither), we pass on args in order to be prepared if this changes
180 return $self->next_prediction(@args);
183 =head2 next_prediction
185 Title : next_prediction
186 Usage : while($gene = $genscan->next_prediction()) {
189 Function: Returns the next gene structure prediction of the Genscan result
190 file. Call this method repeatedly until FALSE is returned.
193 Returns : A Bio::Tools::Prediction::Gene object.
198 sub next_prediction
{
202 # if the prediction section hasn't been parsed yet, we do this now
203 $self->_parse_predictions() unless $self->_predictions_parsed();
205 # get next gene structure
206 $gene = $self->_prediction();
209 # fill in predicted protein, and if available the predicted CDS
212 # use the seq stack if there's a seq on it
213 my $seqobj = pop(@
{$self->{'_seqstack'}});
215 # otherwise read from input stream
216 ($id, $seq) = $self->_read_fasta_seq();
217 # there may be no sequence at all, or none any more
219 $seqobj = Bio
::PrimarySeq
->new('-seq' => $seq,
220 '-display_id' => $id,
221 '-alphabet' => "protein");
225 # check that prediction number matches the prediction number
226 # indicated in the sequence id (there may be incomplete gene
227 # predictions that contain only signals with no associated protein
228 # and CDS, like promoters, poly-A sites etc)
229 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
231 if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) {
232 # this is not our sequence, so push back for next prediction
233 push(@
{$self->{'_seqstack'}}, $seqobj);
235 $gene->predicted_protein($seqobj);
236 # CDS prediction, too?
237 if($self->_has_cds()) {
238 ($id, $seq) = $self->_read_fasta_seq();
239 $seqobj = Bio
::PrimarySeq
->new('-seq' => $seq,
240 '-display_id' => $id,
241 '-alphabet' => "dna");
242 $gene->predicted_cds($seqobj);
251 =head2 _parse_predictions
253 Title : _parse_predictions()
254 Usage : $obj->_parse_predictions()
255 Function: Parses the prediction section. Automatically called by
256 next_prediction() if not yet done.
262 sub _parse_predictions
{
267 while(defined($_ = $self->_readline())) {
268 if(/^\s*(\d+)\.(\d+)/) {
271 my $signalnr = $2; # not used presently
272 if(! defined($gene)) {
273 $gene = Bio
::Tools
::Prediction
::Gene
->new(
274 '-primary' => "GenePrediction$prednr",
275 '-source' => 'Genscan');
279 my @flds = split(' ', $_);
280 # create the feature object depending on the type of signal
282 my $is_exon = grep {$_ eq $flds[1];} (keys(%ExonTags));
284 $predobj = Bio
::Tools
::Prediction
::Exon
->new();
286 # PolyA site, or Promoter
287 $predobj = Bio
::SeqFeature
::Generic
->new();
290 $predobj->source_tag('Genscan');
291 $predobj->score($flds[$#flds]);
292 $predobj->strand((($flds[2] eq '+') ?
1 : -1));
293 my ($start, $end) = @flds[(3,4)];
294 if($predobj->strand() == 1) {
295 $predobj->start($start);
298 $predobj->end($start);
299 $predobj->start($end);
301 # add to gene structure (should be done only when start and end
302 # are set, in order to allow for proper expansion of the range)
304 # first, set fields unique to exons
305 $predobj->start_signal_score($flds[8]);
306 $predobj->end_signal_score($flds[9]);
307 $predobj->coding_signal_score($flds[10]);
308 $predobj->significance($flds[11]);
309 $predobj->primary_tag($ExonTags{$flds[1]} . 'Exon');
310 $predobj->is_coding(1);
311 # Figure out the frame of this exon. This is NOT the frame
312 # given by Genscan, which is the absolute frame of the base
313 # starting the first predicted complete codon. By comparing
314 # to the absolute frame of the first base we can compute the
315 # offset of the first complete codon to the first base of the
316 # exon, which determines the frame of the exon.
318 if($predobj->strand() == 1) {
319 $cod_offset = $flds[6] - (($predobj->start()-1) % 3);
320 # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
321 # to offsets 2 and 1, resp. Offset 3 is the same as 0.
322 $cod_offset += 3 if($cod_offset < 1);
324 # On the reverse strand the Genscan frame also refers to
325 # the first base of the first complete codon, but viewed
326 # from forward, which is the third base viewed from
328 $cod_offset = $flds[6] - (($predobj->end()-3) % 3);
329 # Possible values are -2, -1, 0, 1, 2. Due to the reverse
330 # situation, {2,-1} and {1,-2} correspond to offsets
331 # 1 and 2, resp. Offset 3 is the same as 0.
332 $cod_offset -= 3 if($cod_offset >= 0);
333 $cod_offset = -$cod_offset;
335 # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
336 # is the frame of the first base relative to the exon, or the
337 # number of bases the first codon is missing).
338 $predobj->frame(3 - $cod_offset);
339 # then add to gene structure object
340 $gene->add_exon($predobj, $ExonTags{$flds[1]});
341 } elsif($flds[1] eq 'PlyA') {
342 $predobj->primary_tag("PolyAsite");
343 $gene->poly_A_site($predobj);
344 } elsif($flds[1] eq 'Prom') {
345 $predobj->primary_tag("Promoter");
346 $gene->add_promoter($predobj);
350 if(/^\s*$/ && defined($gene)) {
351 # current gene is completed
352 $gene->seq_id($seqname);
353 $self->_add_prediction($gene);
357 if(/^(GENSCAN)\s+(\S+)/) {
358 $self->analysis_method($1);
359 $self->analysis_method_version($2);
362 if(/^Sequence\s+(\S+)\s*:/) {
367 if(/^Parameter matrix:\s+(\S+)/i) {
368 $self->analysis_subject($1);
372 if(/^Predicted coding/) {
377 # section of predicted sequences
378 $self->_pushback($_);
382 $self->_predictions_parsed(1);
387 Title : _prediction()
388 Usage : $gene = $obj->_prediction()
398 return unless(exists($self->{'_preds'}) && @
{$self->{'_preds'}});
399 return shift(@
{$self->{'_preds'}});
402 =head2 _add_prediction
404 Title : _add_prediction()
405 Usage : $obj->_add_prediction($gene)
412 sub _add_prediction
{
413 my ($self, $gene) = @_;
415 if(! exists($self->{'_preds'})) {
416 $self->{'_preds'} = [];
418 push(@
{$self->{'_preds'}}, $gene);
421 =head2 _predictions_parsed
423 Title : _predictions_parsed
424 Usage : $obj->_predictions_parsed
427 Returns : TRUE or FALSE
431 sub _predictions_parsed
{
432 my ($self, $val) = @_;
434 $self->{'_preds_parsed'} = $val if $val;
435 if(! exists($self->{'_preds_parsed'})) {
436 $self->{'_preds_parsed'} = 0;
438 return $self->{'_preds_parsed'};
444 Usage : $obj->_has_cds()
445 Function: Whether or not the result contains the predicted CDSs, too.
447 Returns : TRUE or FALSE
452 my ($self, $val) = @_;
454 $self->{'_has_cds'} = $val if $val;
455 if(! exists($self->{'_has_cds'})) {
456 $self->{'_has_cds'} = 0;
458 return $self->{'_has_cds'};
461 =head2 _read_fasta_seq
463 Title : _read_fasta_seq()
464 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
465 Function: Simple but specialised FASTA format sequence reader. Uses
466 $self->_readline() to retrieve input, and is able to strip off
467 the traling description lines.
469 Returns : An array of two elements.
473 sub _read_fasta_seq
{
478 my $entry = $self->_readline();
481 # complete the entry if the first line came from a pushback buffer
482 while($entry !~ />$/) {
483 last unless $_ = $self->_readline();
486 # delete everything onwards from an intervening empty line (at the
487 # end there might be statistics stuff)
488 $entry =~ s/\n\n.*$//s;
490 if($entry =~ /^(\S+)\n([^>]+)/) {
494 $self->throw("Can't parse Genscan predicted sequence entry");
496 $seq =~ s/\s//g; # Remove whitespace