maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / Tools / Genscan.pm
blob4076c1a96c7ce96488753d32abeadbafa9d2fca8
2 # BioPerl module for Bio::Tools::Genscan
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Hilmar Lapp <hlapp@gmx.net>
8 # Copyright Hilmar Lapp
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::Genscan - Results of one Genscan run
18 =head1 SYNOPSIS
20 use Bio::Tools::Genscan;
22 $genscan = Bio::Tools::Genscan->new(-file => 'result.genscan');
23 # filehandle:
24 $genscan = Bio::Tools::Genscan->new( -fh => \*INPUT );
26 # parse the results
27 # note: this class is-a Bio::Tools::AnalysisResult which implements
28 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
29 while($gene = $genscan->next_prediction()) {
30 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
31 # off Bio::SeqFeature::Gene::Transcript.
33 # $gene->exons() returns an array of
34 # Bio::Tools::Prediction::Exon objects
35 # all exons:
36 @exon_arr = $gene->exons();
38 # initial exons only
39 @init_exons = $gene->exons('Initial');
40 # internal exons only
41 @intrl_exons = $gene->exons('Internal');
42 # terminal exons only
43 @term_exons = $gene->exons('Terminal');
44 # singleton exons:
45 ($single_exon) = $gene->exons();
48 # essential if you gave a filename at initialization (otherwise the file
49 # will stay open)
50 $genscan->close();
52 =head1 DESCRIPTION
54 The Genscan module provides a parser for Genscan gene structure prediction
55 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript-
56 derived object.
58 This module also implements the Bio::SeqAnalysisParserI interface, and thus
59 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>.
61 =head1 FEEDBACK
63 =head2 Mailing Lists
65 User feedback is an integral part of the evolution of this and other
66 Bioperl modules. Send your comments and suggestions preferably to one
67 of the Bioperl mailing lists. Your participation is much appreciated.
69 bioperl-l@bioperl.org - General discussion
70 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72 =head2 Support
74 Please direct usage questions or support issues to the mailing list:
76 I<bioperl-l@bioperl.org>
78 rather than to the module maintainer directly. Many experienced and
79 reponsive experts will be able look at the problem and quickly
80 address it. Please include a thorough description of the problem
81 with code and data examples if at all possible.
83 =head2 Reporting Bugs
85 Report bugs to the Bioperl bug tracking system to help us keep track
86 the bugs and their resolution. Bug reports can be submitted via the
87 web:
89 https://github.com/bioperl/bioperl-live/issues
91 =head1 AUTHOR - Hilmar Lapp
93 Email hlapp@gmx.net
95 =head1 APPENDIX
97 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
99 =cut
102 # Let the code begin...
105 package Bio::Tools::Genscan;
106 use strict;
107 use Symbol;
109 use Bio::Root::Root;
110 use Bio::Tools::Prediction::Gene;
111 use Bio::Tools::Prediction::Exon;
113 use base qw(Bio::Tools::AnalysisResult);
115 my %ExonTags = ('Init' => 'Initial',
116 'Intr' => 'Internal',
117 'Term' => 'Terminal',
118 'Sngl' => '');
120 sub _initialize_state {
121 my ($self,@args) = @_;
123 # first call the inherited method!
124 $self->SUPER::_initialize_state(@args);
126 # our private state variables
127 $self->{'_preds_parsed'} = 0;
128 $self->{'_has_cds'} = 0;
129 # array of pre-parsed predictions
130 $self->{'_preds'} = [];
131 # seq stack
132 $self->{'_seqstack'} = [];
135 =head2 analysis_method
137 Usage : $genscan->analysis_method();
138 Purpose : Inherited method. Overridden to ensure that the name matches
139 /genscan/i.
140 Returns : String
141 Argument : n/a
143 =cut
145 #-------------
146 sub analysis_method {
147 #-------------
148 my ($self, $method) = @_;
149 if($method && ($method !~ /genscan/i)) {
150 $self->throw("method $method not supported in " . ref($self));
152 return $self->SUPER::analysis_method($method);
155 =head2 next_feature
157 Title : next_feature
158 Usage : while($gene = $genscan->next_feature()) {
159 # do something
161 Function: Returns the next gene structure prediction of the Genscan result
162 file. Call this method repeatedly until FALSE is returned.
164 The returned object is actually a SeqFeatureI implementing object.
165 This method is required for classes implementing the
166 SeqAnalysisParserI interface, and is merely an alias for
167 next_prediction() at present.
169 Example :
170 Returns : A Bio::Tools::Prediction::Gene object.
171 Args :
173 =cut
175 sub next_feature {
176 my ($self,@args) = @_;
177 # even though next_prediction doesn't expect any args (and this method
178 # does neither), we pass on args in order to be prepared if this changes
179 # ever
180 return $self->next_prediction(@args);
183 =head2 next_prediction
185 Title : next_prediction
186 Usage : while($gene = $genscan->next_prediction()) {
187 # do something
189 Function: Returns the next gene structure prediction of the Genscan result
190 file. Call this method repeatedly until FALSE is returned.
192 Example :
193 Returns : A Bio::Tools::Prediction::Gene object.
194 Args :
196 =cut
198 sub next_prediction {
199 my ($self) = @_;
200 my $gene;
202 # if the prediction section hasn't been parsed yet, we do this now
203 $self->_parse_predictions() unless $self->_predictions_parsed();
205 # get next gene structure
206 $gene = $self->_prediction();
208 if($gene) {
209 # fill in predicted protein, and if available the predicted CDS
211 my ($id, $seq);
212 # use the seq stack if there's a seq on it
213 my $seqobj = pop(@{$self->{'_seqstack'}});
214 if(! $seqobj) {
215 # otherwise read from input stream
216 ($id, $seq) = $self->_read_fasta_seq();
217 # there may be no sequence at all, or none any more
218 if($id && $seq) {
219 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
220 '-display_id' => $id,
221 '-alphabet' => "protein");
224 if($seqobj) {
225 # check that prediction number matches the prediction number
226 # indicated in the sequence id (there may be incomplete gene
227 # predictions that contain only signals with no associated protein
228 # and CDS, like promoters, poly-A sites etc)
229 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
230 my $prednr = $1;
231 if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) {
232 # this is not our sequence, so push back for next prediction
233 push(@{$self->{'_seqstack'}}, $seqobj);
234 } else {
235 $gene->predicted_protein($seqobj);
236 # CDS prediction, too?
237 if($self->_has_cds()) {
238 ($id, $seq) = $self->_read_fasta_seq();
239 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
240 '-display_id' => $id,
241 '-alphabet' => "dna");
242 $gene->predicted_cds($seqobj);
248 return $gene;
251 =head2 _parse_predictions
253 Title : _parse_predictions()
254 Usage : $obj->_parse_predictions()
255 Function: Parses the prediction section. Automatically called by
256 next_prediction() if not yet done.
257 Example :
258 Returns :
260 =cut
262 sub _parse_predictions {
263 my ($self) = @_;
264 my $gene;
265 my $seqname;
267 while(defined($_ = $self->_readline())) {
268 if(/^\s*(\d+)\.(\d+)/) {
269 # exon or signal
270 my $prednr = $1;
271 my $signalnr = $2; # not used presently
272 if(! defined($gene)) {
273 $gene = Bio::Tools::Prediction::Gene->new(
274 '-primary' => "GenePrediction$prednr",
275 '-source' => 'Genscan');
277 # split into fields
278 chomp();
279 my @flds = split(' ', $_);
280 # create the feature object depending on the type of signal
281 my $predobj;
282 my $is_exon = grep {$_ eq $flds[1];} (keys(%ExonTags));
283 if($is_exon) {
284 $predobj = Bio::Tools::Prediction::Exon->new();
285 } else {
286 # PolyA site, or Promoter
287 $predobj = Bio::SeqFeature::Generic->new();
289 # set common fields
290 $predobj->source_tag('Genscan');
291 $predobj->score($flds[$#flds]);
292 $predobj->strand((($flds[2] eq '+') ? 1 : -1));
293 my ($start, $end) = @flds[(3,4)];
294 if($predobj->strand() == 1) {
295 $predobj->start($start);
296 $predobj->end($end);
297 } else {
298 $predobj->end($start);
299 $predobj->start($end);
301 # add to gene structure (should be done only when start and end
302 # are set, in order to allow for proper expansion of the range)
303 if($is_exon) {
304 # first, set fields unique to exons
305 $predobj->start_signal_score($flds[8]);
306 $predobj->end_signal_score($flds[9]);
307 $predobj->coding_signal_score($flds[10]);
308 $predobj->significance($flds[11]);
309 $predobj->primary_tag($ExonTags{$flds[1]} . 'Exon');
310 $predobj->is_coding(1);
311 # Figure out the frame of this exon. This is NOT the frame
312 # given by Genscan, which is the absolute frame of the base
313 # starting the first predicted complete codon. By comparing
314 # to the absolute frame of the first base we can compute the
315 # offset of the first complete codon to the first base of the
316 # exon, which determines the frame of the exon.
317 my $cod_offset;
318 if($predobj->strand() == 1) {
319 $cod_offset = $flds[6] - (($predobj->start()-1) % 3);
320 # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
321 # to offsets 2 and 1, resp. Offset 3 is the same as 0.
322 $cod_offset += 3 if($cod_offset < 1);
323 } else {
324 # On the reverse strand the Genscan frame also refers to
325 # the first base of the first complete codon, but viewed
326 # from forward, which is the third base viewed from
327 # reverse.
328 $cod_offset = $flds[6] - (($predobj->end()-3) % 3);
329 # Possible values are -2, -1, 0, 1, 2. Due to the reverse
330 # situation, {2,-1} and {1,-2} correspond to offsets
331 # 1 and 2, resp. Offset 3 is the same as 0.
332 $cod_offset -= 3 if($cod_offset >= 0);
333 $cod_offset = -$cod_offset;
335 # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
336 # is the frame of the first base relative to the exon, or the
337 # number of bases the first codon is missing).
338 $predobj->frame(3 - $cod_offset);
339 # then add to gene structure object
340 $gene->add_exon($predobj, $ExonTags{$flds[1]});
341 } elsif($flds[1] eq 'PlyA') {
342 $predobj->primary_tag("PolyAsite");
343 $gene->poly_A_site($predobj);
344 } elsif($flds[1] eq 'Prom') {
345 $predobj->primary_tag("Promoter");
346 $gene->add_promoter($predobj);
348 next;
350 if(/^\s*$/ && defined($gene)) {
351 # current gene is completed
352 $gene->seq_id($seqname);
353 $self->_add_prediction($gene);
354 $gene = undef;
355 next;
357 if(/^(GENSCAN)\s+(\S+)/) {
358 $self->analysis_method($1);
359 $self->analysis_method_version($2);
360 next;
362 if(/^Sequence\s+(\S+)\s*:/) {
363 $seqname = $1;
364 next;
367 if(/^Parameter matrix:\s+(\S+)/i) {
368 $self->analysis_subject($1);
369 next;
372 if(/^Predicted coding/) {
373 $self->_has_cds(1);
374 next;
376 /^>/ && do {
377 # section of predicted sequences
378 $self->_pushback($_);
379 last;
382 $self->_predictions_parsed(1);
385 =head2 _prediction
387 Title : _prediction()
388 Usage : $gene = $obj->_prediction()
389 Function: internal
390 Example :
391 Returns :
393 =cut
395 sub _prediction {
396 my ($self) = @_;
398 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
399 return shift(@{$self->{'_preds'}});
402 =head2 _add_prediction
404 Title : _add_prediction()
405 Usage : $obj->_add_prediction($gene)
406 Function: internal
407 Example :
408 Returns :
410 =cut
412 sub _add_prediction {
413 my ($self, $gene) = @_;
415 if(! exists($self->{'_preds'})) {
416 $self->{'_preds'} = [];
418 push(@{$self->{'_preds'}}, $gene);
421 =head2 _predictions_parsed
423 Title : _predictions_parsed
424 Usage : $obj->_predictions_parsed
425 Function: internal
426 Example :
427 Returns : TRUE or FALSE
429 =cut
431 sub _predictions_parsed {
432 my ($self, $val) = @_;
434 $self->{'_preds_parsed'} = $val if $val;
435 if(! exists($self->{'_preds_parsed'})) {
436 $self->{'_preds_parsed'} = 0;
438 return $self->{'_preds_parsed'};
441 =head2 _has_cds
443 Title : _has_cds()
444 Usage : $obj->_has_cds()
445 Function: Whether or not the result contains the predicted CDSs, too.
446 Example :
447 Returns : TRUE or FALSE
449 =cut
451 sub _has_cds {
452 my ($self, $val) = @_;
454 $self->{'_has_cds'} = $val if $val;
455 if(! exists($self->{'_has_cds'})) {
456 $self->{'_has_cds'} = 0;
458 return $self->{'_has_cds'};
461 =head2 _read_fasta_seq
463 Title : _read_fasta_seq()
464 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
465 Function: Simple but specialised FASTA format sequence reader. Uses
466 $self->_readline() to retrieve input, and is able to strip off
467 the traling description lines.
468 Example :
469 Returns : An array of two elements.
471 =cut
473 sub _read_fasta_seq {
474 my ($self) = @_;
475 my ($id, $seq);
476 local $/ = ">";
478 my $entry = $self->_readline();
479 if($entry) {
480 $entry =~ s/^>//;
481 # complete the entry if the first line came from a pushback buffer
482 while($entry !~ />$/) {
483 last unless $_ = $self->_readline();
484 $entry .= $_;
486 # delete everything onwards from an intervening empty line (at the
487 # end there might be statistics stuff)
488 $entry =~ s/\n\n.*$//s;
489 # id and sequence
490 if($entry =~ /^(\S+)\n([^>]+)/) {
491 $id = $1;
492 $seq = $2;
493 } else {
494 $self->throw("Can't parse Genscan predicted sequence entry");
496 $seq =~ s/\s//g; # Remove whitespace
498 return ($id, $seq);