Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / Tools / Genemark.pm
blobb7117b30870f70b8f9b6a0a1a992d6edf04720fd
2 # BioPerl module for Bio::Tools::Genemark
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Mark Fiers <hlapp@gmx.net>
8 # Copyright Hilmar Lapp, Mark Fiers
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::Genemark - Results of one Genemark run
18 =head1 SYNOPSIS
20 $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark');
21 # filehandle:
22 $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT );
24 # parse the results
25 # note: this class is-a Bio::Tools::AnalysisResult which implements
26 # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same
27 while($gene = $Genemark->next_prediction()) {
28 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
29 # off Bio::SeqFeature::Gene::Transcript.
31 # $gene->exons() returns an array of
32 # Bio::Tools::Prediction::Exon objects
33 # all exons:
34 @exon_arr = $gene->exons();
36 # initial exons only
37 @init_exons = $gene->exons('Initial');
38 # internal exons only
39 @intrl_exons = $gene->exons('Internal');
40 # terminal exons only
41 @term_exons = $gene->exons('Terminal');
42 # singleton exons:
43 ($single_exon) = $gene->exons();
46 # essential if you gave a filename at initialization (otherwise the file
47 # will stay open)
48 $Genemark->close();
50 =head1 DESCRIPTION
52 The Genemark module provides a parser for Genemark gene structure
53 prediction output. It parses one gene prediction into a
54 Bio::SeqFeature::Gene::Transcript- derived object.
56 This module has been developed around genemark.hmm for eukaryots v2.2a
57 and will probably not work with other versions.
60 This module also implements the Bio::SeqAnalysisParserI interface, and
61 thus can be used wherever such an object fits. See
62 L<Bio::SeqAnalysisParserI>.
64 =head1 FEEDBACK
66 =head2 Mailing Lists
68 User feedback is an integral part of the evolution of this and other
69 Bioperl modules. Send your comments and suggestions preferably to one
70 of the Bioperl mailing lists. Your participation is much appreciated.
72 bioperl-l@bioperl.org - General discussion
73 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
75 =head2 Support
77 Please direct usage questions or support issues to the mailing list:
79 I<bioperl-l@bioperl.org>
81 rather than to the module maintainer directly. Many experienced and
82 reponsive experts will be able look at the problem and quickly
83 address it. Please include a thorough description of the problem
84 with code and data examples if at all possible.
86 =head2 Reporting Bugs
88 Report bugs to the Bioperl bug tracking system to help us keep track
89 the bugs and their resolution. Bug reports can be submitted via the
90 web:
92 https://github.com/bioperl/bioperl-live/issues
94 =head1 AUTHOR - Hilmar Lapp, Mark Fiers
96 Email hlapp@gmx.net
97 m.w.e.j.fiers@plant.wag-ur.nl
99 =head1 APPENDIX
101 The rest of the documentation details each of the object
102 methods. Internal methods are usually preceded with a _
104 =cut
107 # Let the code begin...
110 package Bio::Tools::Genemark;
111 use strict;
112 use Symbol;
114 use Bio::Root::Root;
115 use Bio::Tools::Prediction::Gene;
116 use Bio::Tools::Prediction::Exon;
117 use Bio::Seq;
118 use Bio::Factory::FTLocationFactory;
120 use base qw(Bio::Tools::AnalysisResult);
122 =head2 new
124 Title : new
125 Usage : my $obj = Bio::Tools::Genemark->new();
126 Function: Builds a new Bio::Tools::Genemark object
127 Returns : an instance of Bio::Tools::Genemark
128 Args : seqname
131 =cut
133 sub new {
134 my($class,@args) = @_;
136 my $self = $class->SUPER::new(@args);
138 my ($seqname) = $self->_rearrange([qw(SEQNAME)], @args);
140 # hardwire seq_id when creating gene and exon objects
141 $self->_seqname($seqname) if defined($seqname);
143 return $self;
146 sub _initialize_state {
147 my ($self,@args) = @_;
149 # first call the inherited method!
150 $self->SUPER::_initialize_state(@args);
152 # our private state variables
153 $self->{'_preds_parsed'} = 0;
154 $self->{'_has_cds'} = 0;
155 # array of pre-parsed predictions
156 $self->{'_preds'} = [];
157 # seq stack
158 $self->{'_seqstack'} = [];
161 =head2 analysis_method
163 Usage : $Genemark->analysis_method();
164 Purpose : Inherited method. Overridden to ensure that the name matches
165 /GeneMark.hmm/i.
166 Returns : String
167 Argument : n/a
169 =cut
171 #-------------
172 sub analysis_method {
173 #-------------
174 my ($self, $method) = @_;
175 if($method && ($method !~ /Genemark\.hmm/i)) {
176 $self->throw("method $method not supported in " . ref($self));
178 return $self->SUPER::analysis_method($method);
181 =head2 next_feature
183 Title : next_feature
184 Usage : while($gene = $Genemark->next_feature()) {
185 # do something
187 Function: Returns the next gene structure prediction of the Genemark result
188 file. Call this method repeatedly until FALSE is returned.
190 The returned object is actually a SeqFeatureI implementing object.
191 This method is required for classes implementing the
192 SeqAnalysisParserI interface, and is merely an alias for
193 next_prediction() at present.
195 Example :
196 Returns : A Bio::Tools::Prediction::Gene object.
197 Args :
199 =cut
201 sub next_feature {
202 my ($self,@args) = @_;
203 # even though next_prediction doesn't expect any args (and this method
204 # does neither), we pass on args in order to be prepared if this changes
205 # ever
206 return $self->next_prediction(@args);
209 =head2 next_prediction
211 Title : next_prediction
212 Usage : while($gene = $Genemark->next_prediction()) {
213 # do something
215 Function: Returns the next gene structure prediction of the Genemark result
216 file. Call this method repeatedly until FALSE is returned.
218 Example :
219 Returns : A Bio::Tools::Prediction::Gene object.
220 Args :
222 =cut
224 sub next_prediction {
225 my ($self) = @_;
226 my $gene;
228 # if the prediction section hasn't been parsed yet, we do this now
229 $self->_parse_predictions() unless $self->_predictions_parsed();
231 # get next gene structure
232 $gene = $self->_prediction();
234 return $gene;
237 =head2 _parse_predictions
239 Title : _parse_predictions()
240 Usage : $obj->_parse_predictions()
241 Function: Parses the prediction section. Automatically called by
242 next_prediction() if not yet done.
243 Example :
244 Returns :
246 =cut
248 sub _parse_predictions {
249 my ($self) = @_;
250 my %exontags = ('Initial' => 'Initial',
251 'Internal' => 'Internal',
252 'Terminal' => 'Terminal',
253 'Single' => '',
254 '_na_' => '');
255 my $exontag;
256 my $gene;
257 my $exontype;
258 my $current_gene_no = -1;
260 # The prediction report does not contain a sequence identifier
261 # (at least the prokaryotic version doesn't)
262 my $seqname = $self->_seqname();
264 while(defined($_ = $self->_readline())) {
266 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) {
268 # this is an exon, Genemark doesn't predict anything else
269 # $prednr corresponds to geneno.
270 my $prednr = $1;
272 #exon no:
273 my $signalnr = 0;
274 if ($2) { my $signalnr = $2; } # used in tag: exon_no
276 # split into fields
277 chomp();
278 my @flds = split(' ', $_);
280 # create the feature (an exon) object
281 my $predobj = Bio::Tools::Prediction::Exon->new();
284 # define info depending on it being eu- or prokaryot
285 my ($start, $end, $orientation, $prediction_source);
287 if ($self->analysis_method() =~ /PROKARYOTIC/i) {
288 $prediction_source = "Genemark.hmm.pro";
289 $orientation = ($flds[1] eq '+') ? 1 : -1;
290 ($start, $end) = @flds[(2,3)];
291 $exontag = "_na_";
293 } else {
294 $prediction_source = "Genemark.hmm.eu";
295 $orientation = ($flds[2] eq '+') ? 1 : -1;
296 ($start, $end) = @flds[(4,5)];
297 $exontag = $flds[3];
300 # instatiate a location object via
301 # Bio::Factory::FTLocationFactory (to handle
302 # inexact coordinates)
303 my $location_string = join('..', $start, $end);
304 my $location_factory = Bio::Factory::FTLocationFactory->new();
305 my $location_obj = $location_factory->from_string($location_string);
306 $predobj->location($location_obj);
308 #store the data in the exon object
309 $predobj->source_tag($prediction_source);
310 $predobj->strand($orientation);
312 $predobj->primary_tag($exontags{$exontag} . "Exon");
314 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr);
316 $predobj->is_coding(1);
318 $predobj->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
320 # frame calculation as in the genscan module
321 # is to be implemented...
323 #If the $prednr is not equal to the current gene, we
324 #need to make a new gene and close the old one
325 if($prednr != $current_gene_no) {
326 # a new gene, store the old one if it exists
327 if (defined ($gene)) {
328 $gene->seq_id($seqname);
329 $gene = undef ;
331 #and make a new one
332 $gene = Bio::Tools::Prediction::Gene->new
334 '-primary' => "GenePrediction$prednr",
335 '-source' => $prediction_source);
336 $self->_add_prediction($gene);
337 $current_gene_no = $prednr;
338 $gene->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
341 # Add the exon to the gene
342 $gene->add_exon($predobj, ($exontag eq "_na_" ?
343 undef : $exontags{$exontag}));
347 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) {
348 $self->analysis_method($1);
350 my $gm_version = $2;
352 $self->analysis_method_version($gm_version);
353 next;
356 #Matrix file for eukaryot version
357 if (/^Matrices file:\s+(\S+)?/i) {
358 $self->analysis_subject($1);
359 # since the line after the matrix file is always the date
360 # (in the output file's I have seen!) extract and store this
361 # here
362 if (defined(my $_date = $self->_readline())) {
363 chomp ($_date);
364 $self->analysis_date($_date);
368 #Matrix file for prokaryot version
369 if (/^Model file name:\s+(\S+)/) {
370 $self->analysis_subject($1);
371 # since the line after the matrix file is always the date
372 # (in the output file's I have seen!) extract and store this
373 # here
374 my $_date = $self->_readline() ;
375 if (defined($_date = $self->_readline())) {
376 chomp ($_date);
377 $self->analysis_date($_date);
381 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) {
382 $seqname = $1;
383 # $self->analysis_subject($seqname);
384 next;
388 /^>/ && do {
389 $self->_pushback($_);
391 # section of predicted aa sequences on recognition
392 # of a fasta start, read all sequences and find the
393 # appropriate gene
394 while (1) {
395 my ($aa_id, $seq) = $self->_read_fasta_seq();
396 last unless ($aa_id);
398 #now parse through the predictions to add the pred. protein
399 FINDPRED: foreach my $gene (@{$self->{'_preds'}}) {
400 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
401 my $geneno = $1;
402 if ($aa_id =~ /\|gene.$geneno\|/) {
403 #print "x SEQ : \n $seq \nXXXX\n";
404 my $seqobj = Bio::Seq->new('-seq' => $seq,
405 '-display_id' => $aa_id,
406 '-alphabet' => "protein");
407 $gene->predicted_protein($seqobj);
408 last FINDPRED;
414 last;
418 # if the analysis query object contains a ref to a Seq of PrimarySeq
419 # object, then extract the predicted sequences and add it to the gene
420 # object.
421 if (defined $self->analysis_query()) {
422 my $orig_seq = $self->analysis_query();
423 FINDPREDSEQ: foreach my $gene (@{$self->{'_preds'}}) {
424 my $predseq = "";
425 foreach my $exon ($gene->exons()) {
426 #print $exon->start() . " " . $exon->end () . "\n";
427 $predseq .= $orig_seq->subseq($exon->start(), $exon->end());
430 my $seqobj = Bio::PrimarySeq->new('-seq' => $predseq,
431 '-display_id' => "transl");
432 $gene->predicted_cds($seqobj);
437 $self->_predictions_parsed(1);
440 =head2 _prediction
442 Title : _prediction()
443 Usage : $gene = $obj->_prediction()
444 Function: internal
445 Example :
446 Returns :
448 =cut
450 sub _prediction {
451 my ($self) = @_;
453 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
454 return shift(@{$self->{'_preds'}});
457 =head2 _add_prediction
459 Title : _add_prediction()
460 Usage : $obj->_add_prediction($gene)
461 Function: internal
462 Example :
463 Returns :
465 =cut
467 sub _add_prediction {
468 my ($self, $gene) = @_;
470 if(! exists($self->{'_preds'})) {
471 $self->{'_preds'} = [];
473 push(@{$self->{'_preds'}}, $gene);
476 =head2 _predictions_parsed
478 Title : _predictions_parsed
479 Usage : $obj->_predictions_parsed
480 Function: internal
481 Example :
482 Returns : TRUE or FALSE
484 =cut
486 sub _predictions_parsed {
487 my ($self, $val) = @_;
489 $self->{'_preds_parsed'} = $val if $val;
490 if(! exists($self->{'_preds_parsed'})) {
491 $self->{'_preds_parsed'} = 0;
493 return $self->{'_preds_parsed'};
496 =head2 _has_cds
498 Title : _has_cds()
499 Usage : $obj->_has_cds()
500 Function: Whether or not the result contains the predicted CDSs, too.
501 Example :
502 Returns : TRUE or FALSE
504 =cut
506 sub _has_cds {
507 my ($self, $val) = @_;
509 $self->{'_has_cds'} = $val if $val;
510 if(! exists($self->{'_has_cds'})) {
511 $self->{'_has_cds'} = 0;
513 return $self->{'_has_cds'};
516 =head2 _read_fasta_seq
518 Title : _read_fasta_seq()
519 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
520 Function: Simple but specialised FASTA format sequence reader. Uses
521 $self->_readline() to retrieve input, and is able to strip off
522 the traling description lines.
523 Example :
524 Returns : An array of two elements.
526 =cut
528 sub _read_fasta_seq {
529 my ($self) = @_;
530 my ($id, $seq);
531 local $/ = ">";
533 return 0 unless (my $entry = $self->_readline());
535 $entry =~ s/^>//;
536 # complete the entry if the first line came from a pushback buffer
537 while(! ($entry =~ />$/)) {
538 last unless ($_ = $self->_readline());
539 $entry .= $_;
542 # delete everything onwards from an new fasta start (>)
543 $entry =~ s/\n>.*$//s;
544 # id and sequence
546 if($entry =~ s/^(.+)\n//) {
547 $id = $1;
548 $id =~ s/ /_/g;
549 $seq = $entry;
550 $seq =~ s/\s//g;
551 #print "\n@@ $id \n@@ $seq \n##\n";
552 } else {
553 $self->throw("Can't parse Genemark predicted sequence entry");
555 $seq =~ s/\s//g; # Remove whitespace
556 return ($id, $seq);
559 =head2 _seqname
561 Title : _seqname
562 Usage : $obj->_seqname($seqname)
563 Function: internal
564 Example :
565 Returns : String
567 =cut
569 sub _seqname {
570 my ($self, $val) = @_;
572 $self->{'_seqname'} = $val if $val;
573 if(! exists($self->{'_seqname'})) {
574 $self->{'_seqname'} = 'unknown';
576 return $self->{'_seqname'};