2 # BioPerl module for Bio::Tools::Fgenesh
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Christopher Dwan (chris@dwan.org)
8 # Copied, lock stock & barrel from Genscan.pm
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::Fgenesh - parse results of one Fgenesh run
20 use Bio::Tools::Fgenesh;
22 $fgenesh = Bio::Tools::Fgenesh->new(-file => 'result.fgenesh');
24 $fgenesh = Bio::Tools::Fgenesh->new( -fh => \*INPUT );
27 # note: this class is-a Bio::Tools::AnalysisResult which implements
28 # Bio::SeqAnalysisParserI, i.e., $fgensh->next_feature() is the same
29 while($gene = $fgenesh->next_prediction()) {
30 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
31 # off Bio::SeqFeature::Gene::Transcript.
33 # $gene->exons() returns an array of
34 # Bio::Tools::Prediction::Exon objects
36 @exon_arr = $gene->exons();
39 @init_exons = $gene->exons('Initial');
41 @intrl_exons = $gene->exons('Internal');
43 @term_exons = $gene->exons('Terminal');
45 ($single_exon) = $gene->exons();
48 # essential if you gave a filename at initialization (otherwise the file
54 The Fgenesh module provides a parser for Fgenesh (version 2) gene structure
55 prediction output. It parses one gene prediction into a
56 Bio::SeqFeature::Gene::Transcript- derived object.
58 This module also implements the L<Bio::SeqAnalysisParserI> interface, and thus
59 can be used wherever such an object fits.
65 User feedback is an integral part of the evolution of this and other
66 Bioperl modules. Send your comments and suggestions preferably to one
67 of the Bioperl mailing lists. Your participation is much appreciated.
69 bioperl-l@bioperl.org - General discussion
70 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
74 Please direct usage questions or support issues to the mailing list:
76 I<bioperl-l@bioperl.org>
78 rather than to the module maintainer directly. Many experienced and
79 reponsive experts will be able look at the problem and quickly
80 address it. Please include a thorough description of the problem
81 with code and data examples if at all possible.
85 Report bugs to the Bioperl bug tracking system to help us keep track
86 the bugs and their resolution. Bug reports can be submitted via the
89 https://github.com/bioperl/bioperl-live/issues
91 =head1 AUTHOR - Chris Dwan
93 Email chris-at-dwan.org
97 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
102 # Let the code begin...
105 package Bio
::Tools
::Fgenesh
;
111 use Bio
::Tools
::Prediction
::Gene
;
112 use Bio
::Tools
::Prediction
::Exon
;
114 use base
qw(Bio::Tools::AnalysisResult);
116 my %ExonTags = ('CDSf' => 'Initial',
117 'CDSi' => 'Internal',
118 'CDSl' => 'Terminal',
119 'CDSo' => 'Singleton');
121 sub _initialize_state
{
122 my ($self,@args) = @_;
124 # first call the inherited method!
125 $self->SUPER::_initialize_state
(@args);
127 # our private state variables
128 $self->{'_preds_parsed'} = 0;
129 $self->{'_has_cds'} = 0;
130 # array of pre-parsed predictions
131 $self->{'_preds'} = [];
133 $self->{'_seqstack'} = [];
136 =head2 analysis_method
138 Usage : $genscan->analysis_method();
139 Purpose : Inherited method. Overridden to ensure that the name matches
147 sub analysis_method
{
149 my ($self, $method) = @_;
150 if($method && ($method !~ /fgenesh/i)) {
151 $self->throw("method $method not supported in " . ref($self));
153 return $self->SUPER::analysis_method
($method);
159 Usage : while($gene = $fgenesh->next_feature()) {
162 Function: Returns the next gene structure prediction of the Fgenesh result
163 file. Call this method repeatedly until FALSE is returned.
165 The returned object is actually a SeqFeatureI implementing object.
166 This method is required for classes implementing the
167 SeqAnalysisParserI interface, and is merely an alias for
168 next_prediction() at present.
171 Returns : A Bio::Tools::Prediction::Gene object.
177 my ($self,@args) = @_;
178 # even though next_prediction doesn't expect any args (and this method
179 # does neither), we pass on args in order to be prepared if this changes
181 return $self->next_prediction(@args);
184 =head2 next_prediction
186 Title : next_prediction
187 Usage : while($gene = $fgenesh->next_prediction()) { ... }
188 Function: Returns the next gene structure prediction of the Genscan result
189 file. Call this method repeatedly until FALSE is returned.
191 Returns : A Bio::Tools::Prediction::Gene object.
196 sub next_prediction
{
200 # if the prediction section hasn't been parsed yet, we do this now
201 $self->_parse_predictions() unless $self->_predictions_parsed();
203 # get next gene structure
204 $gene = $self->_prediction();
207 # fill in predicted protein, and if available the predicted CDS
210 # use the seq stack if there's a seq on it
211 my $seqobj = pop(@
{$self->{'_seqstack'}});
214 ($id, $seq) = $self->_read_fasta_seq();
216 if (($id =~ /mrna/) || ($id =~ /cds/)) { $alphabet = 'dna'; }
217 else { $alphabet = 'protein'; }
218 $seqobj = Bio
::PrimarySeq
->new('-seq' => $seq,
219 '-display_id' => $id,
220 '-alphabet' => $alphabet);
224 # check that prediction number matches the prediction number
225 # indicated in the sequence id (there may be incomplete gene
226 # predictions that contain only signals with no associated protein
229 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
231 if ($id !~ /_predicted_(\w+)_$prednr/) {
232 # this is not our sequence, so push back for next prediction
233 push(@
{$self->{'_seqstack'}}, $seqobj);
235 if ($1 eq "protein") {
236 $gene->predicted_protein($seqobj);
237 } elsif (($1 eq "mrna") || ($1 eq "cds")) {
239 $gene->predicted_cds($seqobj);
241 # Have to go back in and get the protein...
242 ($id, $seq) = $self->_read_fasta_seq();
243 if ($id =~ /_cds_/) {
244 ($id, $seq) = $self->_read_fasta_seq();
247 $seqobj = Bio
::PrimarySeq
->new('-seq' => $seq,
248 '-display_id' => $id,
249 '-alphabet' => "protein");
250 $gene->predicted_protein($seqobj);
259 =head2 _parse_predictions
261 Title : _parse_predictions()
262 Usage : $obj->_parse_predictions()
263 Function: Parses the prediction section. Automatically called by
264 next_prediction() if not yet done.
270 sub _parse_predictions
{
275 while(defined($_ = $self->_readline())) {
277 if(/^\s*(\d+)\s+([+\-])/) {
282 my $strand = ($2 eq '+') ?
1 : -1;
284 if(! defined($gene)) {
285 $gene = Bio
::Tools
::Prediction
::Gene
->new(
286 '-primary' => "GenePrediction$prednr",
287 '-source' => 'Fgenesh');
291 my @flds = split(/\s+/, ' ' . $line);
292 ## NB - the above adds leading whitespace before the gene
293 ## number in case there was none (as quick patch to code
294 ## below which expects it but it is not present after 999
295 ## predictions!) This allows >999 predictions to be parsed.
297 # create the feature object depending on the type of signal
299 my $is_exon = grep {$line =~ $_} keys(%ExonTags);
302 $predobj = Bio
::Tools
::Prediction
::Exon
->new();
303 $predobj->score($flds[8]);
308 $predobj = Bio
::SeqFeature
::Generic
->new();
309 $predobj->score($flds[5]);
314 $predobj->source_tag('Fgenesh');
315 $predobj->strand($strand);
317 # Following tactical commenting-out made by
318 # malcolm.cook@stowers-institute.org since coordinate reversal is
319 # apparently vestigial copy/paste detritus from Genscan.pm origins of
320 # this module and this is NOT needed for fgenesh (at least in v
323 # if($predobj->strand() == 1) {
324 $predobj->start($start);
327 # $predobj->end($start);
328 # $predobj->start($end);
331 # print STDERR "start $start end $end\n";
332 # add to gene structure (should be done only when start and end
333 # are set, in order to allow for proper expansion of the range)
335 # first, set fields unique to exons
336 $predobj->primary_tag($ExonTags{$flds[4]} . 'Exon');
337 $predobj->is_coding(1);
339 if($predobj->strand() == 1) {
340 $cod_offset = ($flds[9] - $predobj->start()) % 3;
341 # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
342 # to offsets 2 and 1, resp. Offset 3 is the same as 0.
343 $cod_offset += 3 if($cod_offset < 1);
345 # On the reverse strand the Genscan frame also refers to
346 # the first base of the first complete codon, but viewed
347 # from forward, which is the third base viewed from
349 $cod_offset = ($flds[11] - $predobj->end()) % 3;
350 # Possible values are -2, -1, 0, 1, 2. Due to the reverse
351 # situation, {2,-1} and {1,-2} correspond to offsets
352 # 1 and 2, resp. Offset 3 is the same as 0.
353 $cod_offset -= 3 if($cod_offset >= 0);
354 $cod_offset = -$cod_offset;
356 # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
357 # is the frame of the first base relative to the exon, or the
358 # number of bases the first codon is missing).
359 $predobj->frame(3 - $cod_offset);
360 # print STDERR " frame is " . $predobj->frame() . "\n";
361 # then add to gene structure object
362 $gene->add_exon($predobj, $ExonTags{$flds[1]});
363 } elsif($flds[3] eq 'PolA') {
364 $predobj->primary_tag("PolyAsite");
365 $gene->poly_A_site($predobj);
366 } elsif($flds[3] eq 'TSS') {
367 $predobj->primary_tag("Promoter"); # (hey! a TSS is NOT a promoter... what's going on here?...
368 $gene->add_promoter($predobj);
369 #I'd like to do this (for now):
370 #$predobj->primary_tag("TSS"); #this is not the right model, but, it IS a feature at least.
371 #but the followg errs out
372 #$gene->add_SeqFeature($predobj); #err: MSG: start is undefined when bounds at Bio::SeqFeature::Generic::add_SeqFeature 671 check since gene has no start yet
375 $self->throw("unrecognized prediction line: " . $line);
380 if(/^\s*$/ && defined($gene)) {
381 # current gene is completed
382 $gene->seq_id($seqname);
383 $self->_add_prediction($gene);
388 if(/^(FGENESH)\s+([\d\.]+)/) {
389 $self->analysis_method($1);
390 $self->analysis_method_version($2);
391 if (/\s(\S+)\sgenomic DNA/) {
392 $self->analysis_subject($1);
397 if(/^\s*Seq name:\s+(\S+)/) {
402 /^Predicted protein/ && do {
403 # section of predicted sequences
404 $self->_pushback($_);
409 $self->_predictions_parsed(1);
414 Title : _prediction()
415 Usage : $gene = $obj->_prediction()
425 return unless(exists($self->{'_preds'}) && @
{$self->{'_preds'}});
426 return shift(@
{$self->{'_preds'}});
429 =head2 _add_prediction
431 Title : _add_prediction()
432 Usage : $obj->_add_prediction($gene)
439 sub _add_prediction
{
440 my ($self, $gene) = @_;
442 if(! exists($self->{'_preds'})) {
443 $self->{'_preds'} = [];
445 push(@
{$self->{'_preds'}}, $gene);
448 =head2 _predictions_parsed
450 Title : _predictions_parsed
451 Usage : $obj->_predictions_parsed
454 Returns : TRUE or FALSE
458 sub _predictions_parsed
{
459 my ($self, $val) = @_;
461 $self->{'_preds_parsed'} = $val if $val;
462 if(! exists($self->{'_preds_parsed'})) {
463 $self->{'_preds_parsed'} = 0;
465 return $self->{'_preds_parsed'};
471 Usage : $obj->_has_cds()
472 Function: Whether or not the result contains the predicted CDSs, too.
474 Returns : TRUE or FALSE
479 my ($self, $val) = @_;
481 $self->{'_has_cds'} = $val if $val;
482 if(! exists($self->{'_has_cds'})) {
483 $self->{'_has_cds'} = 0;
485 return $self->{'_has_cds'};
488 =head2 _read_fasta_seq
490 Title : _read_fasta_seq()
491 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
492 Function: Simple but specialised FASTA format sequence reader. Uses
493 $self->_readline() to retrieve input, and is able to strip off
494 the traling description lines.
496 Returns : An array of two elements: fasta_id & sequence
500 sub _read_fasta_seq
{
505 my $entry = $self->_readline();
506 # print " ^^ $entry\n";
507 return unless ($entry);
508 $entry = $self->_readline() if ($entry =~ /^Predicted protein/);
509 # print " ^^ $entry\n";
511 # Pick up the header / id.
512 if ($entry =~ /^>FGENESH:/) {
513 if ($entry =~ /^>FGENESH:\s+(\d+)/) {
514 # print STDERR " this is a predicted gene\n";
515 $id = "_predicted_protein_" . $1;
516 } elsif ($entry =~ /^>FGENESH:\[mRNA\]\s*(\d+)/) {
517 # print STDERR " this is an mRNA\n";
518 $id = "_predicted_mrna_" . $1;
519 } elsif ($entry =~ /^>FGENESH:\[exon\]\s+Gene:\s*(\d+)/) {
520 $id = "_predicted_cds_" . $1;
523 $entry = $self->_readline();
528 # print "*** $entry\n";
529 if (($entry =~ /^>FGENESH:\[exon\]/) && ($id =~ /^_predicted_cds_/)) {
530 # print STDERR " -- informed about an exon header...\n";
531 $entry = $self->_readline();
534 # print STDERR " Added $entry\n";
537 last unless $entry = $self->_readline();
538 if (($entry =~ /^>/) &&
539 (!(($entry =~ /^>FGENESH:\[exon\]/) && ($id =~ /^_predicted_cds_/)))) {
540 $self->_pushback($entry); last;
545 $seq =~ s/\s//g; # Remove whitespace