Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / Tools / Genemark.pm
blobb229ec32983b0888b7328d21b0691a134714d721
2 # BioPerl module for Bio::Tools::Genemark
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Mark Fiers <hlapp@gmx.net>
8 # Copyright Hilmar Lapp, Mark Fiers
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::Genemark - Results of one Genemark run
18 =head1 SYNOPSIS
20 $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark');
21 # filehandle:
22 $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT );
24 # parse the results
25 # note: this class is-a Bio::Tools::AnalysisResult which implements
26 # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same
27 while($gene = $Genemark->next_prediction()) {
28 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
29 # off Bio::SeqFeature::Gene::Transcript.
31 # $gene->exons() returns an array of
32 # Bio::Tools::Prediction::Exon objects
33 # all exons:
34 @exon_arr = $gene->exons();
36 # initial exons only
37 @init_exons = $gene->exons('Initial');
38 # internal exons only
39 @intrl_exons = $gene->exons('Internal');
40 # terminal exons only
41 @term_exons = $gene->exons('Terminal');
42 # singleton exons:
43 ($single_exon) = $gene->exons();
46 # essential if you gave a filename at initialization (otherwise the file
47 # will stay open)
48 $Genemark->close();
50 =head1 DESCRIPTION
52 The Genemark module provides a parser for Genemark gene structure
53 prediction output. It parses one gene prediction into a
54 Bio::SeqFeature::Gene::Transcript- derived object.
56 This module has been developed around genemark.hmm for eukaryots v2.2a
57 and will probably not work with other versions.
60 This module also implements the Bio::SeqAnalysisParserI interface, and
61 thus can be used wherever such an object fits. See
62 L<Bio::SeqAnalysisParserI>.
64 =head1 FEEDBACK
66 =head2 Mailing Lists
68 User feedback is an integral part of the evolution of this and other
69 Bioperl modules. Send your comments and suggestions preferably to one
70 of the Bioperl mailing lists. Your participation is much appreciated.
72 bioperl-l@bioperl.org - General discussion
73 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
75 =head2 Support
77 Please direct usage questions or support issues to the mailing list:
79 I<bioperl-l@bioperl.org>
81 rather than to the module maintainer directly. Many experienced and
82 reponsive experts will be able look at the problem and quickly
83 address it. Please include a thorough description of the problem
84 with code and data examples if at all possible.
86 =head2 Reporting Bugs
88 Report bugs to the Bioperl bug tracking system to help us keep track
89 the bugs and their resolution. Bug reports can be submitted via the
90 web:
92 https://github.com/bioperl/bioperl-live/issues
94 =head1 AUTHOR - Hilmar Lapp, Mark Fiers
96 Email hlapp@gmx.net
97 m.w.e.j.fiers@plant.wag-ur.nl
99 =head1 APPENDIX
101 The rest of the documentation details each of the object
102 methods. Internal methods are usually preceded with a _
104 =cut
107 # Let the code begin...
110 package Bio::Tools::Genemark;
112 use strict;
113 use Symbol;
115 use Bio::Root::Root;
116 use Bio::Tools::Prediction::Gene;
117 use Bio::Tools::Prediction::Exon;
118 use Bio::Seq;
119 use Bio::Factory::FTLocationFactory;
121 use base qw(Bio::Tools::AnalysisResult);
123 =head2 new
125 Title : new
126 Usage : my $obj = Bio::Tools::Genemark->new();
127 Function: Builds a new Bio::Tools::Genemark object
128 Returns : an instance of Bio::Tools::Genemark
129 Args : seqname
132 =cut
134 sub new {
135 my($class,@args) = @_;
137 my $self = $class->SUPER::new(@args);
139 my ($seqname) = $self->_rearrange([qw(SEQNAME)], @args);
141 # hardwire seq_id when creating gene and exon objects
142 $self->_seqname($seqname) if defined($seqname);
144 return $self;
147 sub _initialize_state {
148 my ($self,@args) = @_;
150 # first call the inherited method!
151 $self->SUPER::_initialize_state(@args);
153 # our private state variables
154 $self->{'_preds_parsed'} = 0;
155 $self->{'_has_cds'} = 0;
156 # array of pre-parsed predictions
157 $self->{'_preds'} = [];
158 # seq stack
159 $self->{'_seqstack'} = [];
162 =head2 analysis_method
164 Usage : $Genemark->analysis_method();
165 Purpose : Inherited method. Overridden to ensure that the name matches
166 /GeneMark.hmm/i.
167 Returns : String
168 Argument : n/a
170 =cut
172 #-------------
173 sub analysis_method {
174 #-------------
175 my ($self, $method) = @_;
176 if($method && ($method !~ /Genemark\.hmm/i)) {
177 $self->throw("method $method not supported in " . ref($self));
179 return $self->SUPER::analysis_method($method);
182 =head2 next_feature
184 Title : next_feature
185 Usage : while($gene = $Genemark->next_feature()) {
186 # do something
188 Function: Returns the next gene structure prediction of the Genemark result
189 file. Call this method repeatedly until FALSE is returned.
191 The returned object is actually a SeqFeatureI implementing object.
192 This method is required for classes implementing the
193 SeqAnalysisParserI interface, and is merely an alias for
194 next_prediction() at present.
196 Example :
197 Returns : A Bio::Tools::Prediction::Gene object.
198 Args :
200 =cut
202 sub next_feature {
203 my ($self,@args) = @_;
204 # even though next_prediction doesn't expect any args (and this method
205 # does neither), we pass on args in order to be prepared if this changes
206 # ever
207 return $self->next_prediction(@args);
210 =head2 next_prediction
212 Title : next_prediction
213 Usage : while($gene = $Genemark->next_prediction()) {
214 # do something
216 Function: Returns the next gene structure prediction of the Genemark result
217 file. Call this method repeatedly until FALSE is returned.
219 Example :
220 Returns : A Bio::Tools::Prediction::Gene object.
221 Args :
223 =cut
225 sub next_prediction {
226 my ($self) = @_;
227 my $gene;
229 # if the prediction section hasn't been parsed yet, we do this now
230 $self->_parse_predictions() unless $self->_predictions_parsed();
232 # get next gene structure
233 $gene = $self->_prediction();
235 return $gene;
238 =head2 _parse_predictions
240 Title : _parse_predictions()
241 Usage : $obj->_parse_predictions()
242 Function: Parses the prediction section. Automatically called by
243 next_prediction() if not yet done.
244 Example :
245 Returns :
247 =cut
249 sub _parse_predictions {
250 my ($self) = @_;
251 my %exontags = ('Initial' => 'Initial',
252 'Internal' => 'Internal',
253 'Terminal' => 'Terminal',
254 'Single' => '',
255 '_na_' => '');
256 my $exontag;
257 my $gene;
258 my $exontype;
259 my $current_gene_no = -1;
261 # The prediction report does not contain a sequence identifier
262 # (at least the prokaryotic version doesn't)
263 my $seqname = $self->_seqname();
265 while(defined($_ = $self->_readline())) {
267 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) {
269 # this is an exon, Genemark doesn't predict anything else
270 # $prednr corresponds to geneno.
271 my $prednr = $1;
273 #exon no:
274 my $signalnr = 0;
275 if ($2) { my $signalnr = $2; } # used in tag: exon_no
277 # split into fields
278 chomp();
279 my @flds = split(' ', $_);
281 # create the feature (an exon) object
282 my $predobj = Bio::Tools::Prediction::Exon->new();
285 # define info depending on it being eu- or prokaryot
286 my ($start, $end, $orientation, $prediction_source);
288 if ($self->analysis_method() =~ /PROKARYOTIC/i) {
289 $prediction_source = "Genemark.hmm.pro";
290 $orientation = ($flds[1] eq '+') ? 1 : -1;
291 ($start, $end) = @flds[(2,3)];
292 $exontag = "_na_";
294 } else {
295 $prediction_source = "Genemark.hmm.eu";
296 $orientation = ($flds[2] eq '+') ? 1 : -1;
297 ($start, $end) = @flds[(4,5)];
298 $exontag = $flds[3];
301 # instatiate a location object via
302 # Bio::Factory::FTLocationFactory (to handle
303 # inexact coordinates)
304 my $location_string = join('..', $start, $end);
305 my $location_factory = Bio::Factory::FTLocationFactory->new();
306 my $location_obj = $location_factory->from_string($location_string);
307 $predobj->location($location_obj);
309 #store the data in the exon object
310 $predobj->source_tag($prediction_source);
311 $predobj->strand($orientation);
313 $predobj->primary_tag($exontags{$exontag} . "Exon");
315 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr);
317 $predobj->is_coding(1);
319 $predobj->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
321 # frame calculation as in the genscan module
322 # is to be implemented...
324 #If the $prednr is not equal to the current gene, we
325 #need to make a new gene and close the old one
326 if($prednr != $current_gene_no) {
327 # a new gene, store the old one if it exists
328 if (defined ($gene)) {
329 $gene->seq_id($seqname);
330 $gene = undef ;
332 #and make a new one
333 $gene = Bio::Tools::Prediction::Gene->new
335 '-primary' => "GenePrediction$prednr",
336 '-source' => $prediction_source);
337 $self->_add_prediction($gene);
338 $current_gene_no = $prednr;
339 $gene->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
342 # Add the exon to the gene
343 $gene->add_exon($predobj, ($exontag eq "_na_" ?
344 undef : $exontags{$exontag}));
348 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) {
349 $self->analysis_method($1);
351 my $gm_version = $2;
353 $self->analysis_method_version($gm_version);
354 next;
357 #Matrix file for eukaryot version
358 if (/^Matrices file:\s+(\S+)?/i) {
359 $self->analysis_subject($1);
360 # since the line after the matrix file is always the date
361 # (in the output file's I have seen!) extract and store this
362 # here
363 if (defined(my $_date = $self->_readline())) {
364 chomp ($_date);
365 $self->analysis_date($_date);
369 #Matrix file for prokaryot version
370 if (/^Model file name:\s+(\S+)/) {
371 $self->analysis_subject($1);
372 # since the line after the matrix file is always the date
373 # (in the output file's I have seen!) extract and store this
374 # here
375 my $_date = $self->_readline() ;
376 if (defined($_date = $self->_readline())) {
377 chomp ($_date);
378 $self->analysis_date($_date);
382 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) {
383 $seqname = $1;
384 # $self->analysis_subject($seqname);
385 next;
389 /^>/ && do {
390 $self->_pushback($_);
392 # section of predicted aa sequences on recognition
393 # of a fasta start, read all sequences and find the
394 # appropriate gene
395 while (1) {
396 my ($aa_id, $seq) = $self->_read_fasta_seq();
397 last unless ($aa_id);
399 #now parse through the predictions to add the pred. protein
400 FINDPRED: foreach my $gene (@{$self->{'_preds'}}) {
401 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
402 my $geneno = $1;
403 if ($aa_id =~ /\|gene.$geneno\|/) {
404 #print "x SEQ : \n $seq \nXXXX\n";
405 my $seqobj = Bio::Seq->new('-seq' => $seq,
406 '-display_id' => $aa_id,
407 '-alphabet' => "protein");
408 $gene->predicted_protein($seqobj);
409 last FINDPRED;
415 last;
419 # if the analysis query object contains a ref to a Seq of PrimarySeq
420 # object, then extract the predicted sequences and add it to the gene
421 # object.
422 if (defined $self->analysis_query()) {
423 my $orig_seq = $self->analysis_query();
424 FINDPREDSEQ: foreach my $gene (@{$self->{'_preds'}}) {
425 my $predseq = "";
426 foreach my $exon ($gene->exons()) {
427 #print $exon->start() . " " . $exon->end () . "\n";
428 $predseq .= $orig_seq->subseq($exon->start(), $exon->end());
431 my $seqobj = Bio::PrimarySeq->new('-seq' => $predseq,
432 '-display_id' => "transl");
433 $gene->predicted_cds($seqobj);
438 $self->_predictions_parsed(1);
441 =head2 _prediction
443 Title : _prediction()
444 Usage : $gene = $obj->_prediction()
445 Function: internal
446 Example :
447 Returns :
449 =cut
451 sub _prediction {
452 my ($self) = @_;
454 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
455 return shift(@{$self->{'_preds'}});
458 =head2 _add_prediction
460 Title : _add_prediction()
461 Usage : $obj->_add_prediction($gene)
462 Function: internal
463 Example :
464 Returns :
466 =cut
468 sub _add_prediction {
469 my ($self, $gene) = @_;
471 if(! exists($self->{'_preds'})) {
472 $self->{'_preds'} = [];
474 push(@{$self->{'_preds'}}, $gene);
477 =head2 _predictions_parsed
479 Title : _predictions_parsed
480 Usage : $obj->_predictions_parsed
481 Function: internal
482 Example :
483 Returns : TRUE or FALSE
485 =cut
487 sub _predictions_parsed {
488 my ($self, $val) = @_;
490 $self->{'_preds_parsed'} = $val if $val;
491 if(! exists($self->{'_preds_parsed'})) {
492 $self->{'_preds_parsed'} = 0;
494 return $self->{'_preds_parsed'};
497 =head2 _has_cds
499 Title : _has_cds()
500 Usage : $obj->_has_cds()
501 Function: Whether or not the result contains the predicted CDSs, too.
502 Example :
503 Returns : TRUE or FALSE
505 =cut
507 sub _has_cds {
508 my ($self, $val) = @_;
510 $self->{'_has_cds'} = $val if $val;
511 if(! exists($self->{'_has_cds'})) {
512 $self->{'_has_cds'} = 0;
514 return $self->{'_has_cds'};
517 =head2 _read_fasta_seq
519 Title : _read_fasta_seq()
520 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
521 Function: Simple but specialised FASTA format sequence reader. Uses
522 $self->_readline() to retrieve input, and is able to strip off
523 the traling description lines.
524 Example :
525 Returns : An array of two elements.
527 =cut
529 sub _read_fasta_seq {
530 my ($self) = @_;
531 my ($id, $seq);
532 local $/ = ">";
534 return 0 unless (my $entry = $self->_readline());
536 $entry =~ s/^>//;
537 # complete the entry if the first line came from a pushback buffer
538 while(! ($entry =~ />$/)) {
539 last unless ($_ = $self->_readline());
540 $entry .= $_;
543 # delete everything onwards from an new fasta start (>)
544 $entry =~ s/\n>.*$//s;
545 # id and sequence
547 if($entry =~ s/^(.+)\n//) {
548 $id = $1;
549 $id =~ s/ /_/g;
550 $seq = $entry;
551 $seq =~ s/\s//g;
552 #print "\n@@ $id \n@@ $seq \n##\n";
553 } else {
554 $self->throw("Can't parse Genemark predicted sequence entry");
556 $seq =~ s/\s//g; # Remove whitespace
557 return ($id, $seq);
560 =head2 _seqname
562 Title : _seqname
563 Usage : $obj->_seqname($seqname)
564 Function: internal
565 Example :
566 Returns : String
568 =cut
570 sub _seqname {
571 my ($self, $val) = @_;
573 $self->{'_seqname'} = $val if $val;
574 if(! exists($self->{'_seqname'})) {
575 $self->{'_seqname'} = 'unknown';
577 return $self->{'_seqname'};