2 # BioPerl module for Bio::Tools::Genemark
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Mark Fiers <hlapp@gmx.net>
8 # Copyright Hilmar Lapp, Mark Fiers
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::Genemark - Results of one Genemark run
20 $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark');
22 $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT );
25 # note: this class is-a Bio::Tools::AnalysisResult which implements
26 # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same
27 while($gene = $Genemark->next_prediction()) {
28 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
29 # off Bio::SeqFeature::Gene::Transcript.
31 # $gene->exons() returns an array of
32 # Bio::Tools::Prediction::Exon objects
34 @exon_arr = $gene->exons();
37 @init_exons = $gene->exons('Initial');
39 @intrl_exons = $gene->exons('Internal');
41 @term_exons = $gene->exons('Terminal');
43 ($single_exon) = $gene->exons();
46 # essential if you gave a filename at initialization (otherwise the file
52 The Genemark module provides a parser for Genemark gene structure
53 prediction output. It parses one gene prediction into a
54 Bio::SeqFeature::Gene::Transcript- derived object.
56 This module has been developed around genemark.hmm for eukaryots v2.2a
57 and will probably not work with other versions.
60 This module also implements the Bio::SeqAnalysisParserI interface, and
61 thus can be used wherever such an object fits. See
62 L<Bio::SeqAnalysisParserI>.
68 User feedback is an integral part of the evolution of this and other
69 Bioperl modules. Send your comments and suggestions preferably to one
70 of the Bioperl mailing lists. Your participation is much appreciated.
72 bioperl-l@bioperl.org - General discussion
73 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
77 Please direct usage questions or support issues to the mailing list:
79 I<bioperl-l@bioperl.org>
81 rather than to the module maintainer directly. Many experienced and
82 reponsive experts will be able look at the problem and quickly
83 address it. Please include a thorough description of the problem
84 with code and data examples if at all possible.
88 Report bugs to the Bioperl bug tracking system to help us keep track
89 the bugs and their resolution. Bug reports can be submitted via the
92 https://github.com/bioperl/bioperl-live/issues
94 =head1 AUTHOR - Hilmar Lapp, Mark Fiers
97 m.w.e.j.fiers@plant.wag-ur.nl
101 The rest of the documentation details each of the object
102 methods. Internal methods are usually preceded with a _
107 # Let the code begin...
110 package Bio
::Tools
::Genemark
;
115 use Bio
::Tools
::Prediction
::Gene
;
116 use Bio
::Tools
::Prediction
::Exon
;
118 use Bio
::Factory
::FTLocationFactory
;
120 use base
qw(Bio::Tools::AnalysisResult);
125 Usage : my $obj = Bio::Tools::Genemark->new();
126 Function: Builds a new Bio::Tools::Genemark object
127 Returns : an instance of Bio::Tools::Genemark
134 my($class,@args) = @_;
136 my $self = $class->SUPER::new
(@args);
138 my ($seqname) = $self->_rearrange([qw(SEQNAME)], @args);
140 # hardwire seq_id when creating gene and exon objects
141 $self->_seqname($seqname) if defined($seqname);
146 sub _initialize_state
{
147 my ($self,@args) = @_;
149 # first call the inherited method!
150 $self->SUPER::_initialize_state
(@args);
152 # our private state variables
153 $self->{'_preds_parsed'} = 0;
154 $self->{'_has_cds'} = 0;
155 # array of pre-parsed predictions
156 $self->{'_preds'} = [];
158 $self->{'_seqstack'} = [];
161 =head2 analysis_method
163 Usage : $Genemark->analysis_method();
164 Purpose : Inherited method. Overridden to ensure that the name matches
172 sub analysis_method
{
174 my ($self, $method) = @_;
175 if($method && ($method !~ /Genemark\.hmm/i)) {
176 $self->throw("method $method not supported in " . ref($self));
178 return $self->SUPER::analysis_method
($method);
184 Usage : while($gene = $Genemark->next_feature()) {
187 Function: Returns the next gene structure prediction of the Genemark result
188 file. Call this method repeatedly until FALSE is returned.
190 The returned object is actually a SeqFeatureI implementing object.
191 This method is required for classes implementing the
192 SeqAnalysisParserI interface, and is merely an alias for
193 next_prediction() at present.
196 Returns : A Bio::Tools::Prediction::Gene object.
202 my ($self,@args) = @_;
203 # even though next_prediction doesn't expect any args (and this method
204 # does neither), we pass on args in order to be prepared if this changes
206 return $self->next_prediction(@args);
209 =head2 next_prediction
211 Title : next_prediction
212 Usage : while($gene = $Genemark->next_prediction()) {
215 Function: Returns the next gene structure prediction of the Genemark result
216 file. Call this method repeatedly until FALSE is returned.
219 Returns : A Bio::Tools::Prediction::Gene object.
224 sub next_prediction
{
228 # if the prediction section hasn't been parsed yet, we do this now
229 $self->_parse_predictions() unless $self->_predictions_parsed();
231 # get next gene structure
232 $gene = $self->_prediction();
237 =head2 _parse_predictions
239 Title : _parse_predictions()
240 Usage : $obj->_parse_predictions()
241 Function: Parses the prediction section. Automatically called by
242 next_prediction() if not yet done.
248 sub _parse_predictions
{
250 my %exontags = ('Initial' => 'Initial',
251 'Internal' => 'Internal',
252 'Terminal' => 'Terminal',
258 my $current_gene_no = -1;
260 # The prediction report does not contain a sequence identifier
261 # (at least the prokaryotic version doesn't)
262 my $seqname = $self->_seqname();
264 while(defined($_ = $self->_readline())) {
266 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) {
268 # this is an exon, Genemark doesn't predict anything else
269 # $prednr corresponds to geneno.
274 if ($2) { my $signalnr = $2; } # used in tag: exon_no
278 my @flds = split(' ', $_);
280 # create the feature (an exon) object
281 my $predobj = Bio
::Tools
::Prediction
::Exon
->new();
284 # define info depending on it being eu- or prokaryot
285 my ($start, $end, $orientation, $prediction_source);
287 if ($self->analysis_method() =~ /PROKARYOTIC/i) {
288 $prediction_source = "Genemark.hmm.pro";
289 $orientation = ($flds[1] eq '+') ?
1 : -1;
290 ($start, $end) = @flds[(2,3)];
294 $prediction_source = "Genemark.hmm.eu";
295 $orientation = ($flds[2] eq '+') ?
1 : -1;
296 ($start, $end) = @flds[(4,5)];
300 # instatiate a location object via
301 # Bio::Factory::FTLocationFactory (to handle
302 # inexact coordinates)
303 my $location_string = join('..', $start, $end);
304 my $location_factory = Bio
::Factory
::FTLocationFactory
->new();
305 my $location_obj = $location_factory->from_string($location_string);
306 $predobj->location($location_obj);
308 #store the data in the exon object
309 $predobj->source_tag($prediction_source);
310 $predobj->strand($orientation);
312 $predobj->primary_tag($exontags{$exontag} . "Exon");
314 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr);
316 $predobj->is_coding(1);
318 $predobj->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
320 # frame calculation as in the genscan module
321 # is to be implemented...
323 #If the $prednr is not equal to the current gene, we
324 #need to make a new gene and close the old one
325 if($prednr != $current_gene_no) {
326 # a new gene, store the old one if it exists
327 if (defined ($gene)) {
328 $gene->seq_id($seqname);
332 $gene = Bio
::Tools
::Prediction
::Gene
->new
334 '-primary' => "GenePrediction$prednr",
335 '-source' => $prediction_source);
336 $self->_add_prediction($gene);
337 $current_gene_no = $prednr;
338 $gene->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
341 # Add the exon to the gene
342 $gene->add_exon($predobj, ($exontag eq "_na_" ?
343 undef : $exontags{$exontag}));
347 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) {
348 $self->analysis_method($1);
352 $self->analysis_method_version($gm_version);
356 #Matrix file for eukaryot version
357 if (/^Matrices file:\s+(\S+)?/i) {
358 $self->analysis_subject($1);
359 # since the line after the matrix file is always the date
360 # (in the output file's I have seen!) extract and store this
362 if (defined(my $_date = $self->_readline())) {
364 $self->analysis_date($_date);
368 #Matrix file for prokaryot version
369 if (/^Model file name:\s+(\S+)/) {
370 $self->analysis_subject($1);
371 # since the line after the matrix file is always the date
372 # (in the output file's I have seen!) extract and store this
374 my $_date = $self->_readline() ;
375 if (defined($_date = $self->_readline())) {
377 $self->analysis_date($_date);
381 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) {
383 # $self->analysis_subject($seqname);
389 $self->_pushback($_);
391 # section of predicted aa sequences on recognition
392 # of a fasta start, read all sequences and find the
395 my ($aa_id, $seq) = $self->_read_fasta_seq();
396 last unless ($aa_id);
398 #now parse through the predictions to add the pred. protein
399 FINDPRED
: foreach my $gene (@
{$self->{'_preds'}}) {
400 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
402 if ($aa_id =~ /\|gene.$geneno\|/) {
403 #print "x SEQ : \n $seq \nXXXX\n";
404 my $seqobj = Bio
::Seq
->new('-seq' => $seq,
405 '-display_id' => $aa_id,
406 '-alphabet' => "protein");
407 $gene->predicted_protein($seqobj);
418 # if the analysis query object contains a ref to a Seq of PrimarySeq
419 # object, then extract the predicted sequences and add it to the gene
421 if (defined $self->analysis_query()) {
422 my $orig_seq = $self->analysis_query();
423 FINDPREDSEQ
: foreach my $gene (@
{$self->{'_preds'}}) {
425 foreach my $exon ($gene->exons()) {
426 #print $exon->start() . " " . $exon->end () . "\n";
427 $predseq .= $orig_seq->subseq($exon->start(), $exon->end());
430 my $seqobj = Bio
::PrimarySeq
->new('-seq' => $predseq,
431 '-display_id' => "transl");
432 $gene->predicted_cds($seqobj);
437 $self->_predictions_parsed(1);
442 Title : _prediction()
443 Usage : $gene = $obj->_prediction()
453 return unless(exists($self->{'_preds'}) && @
{$self->{'_preds'}});
454 return shift(@
{$self->{'_preds'}});
457 =head2 _add_prediction
459 Title : _add_prediction()
460 Usage : $obj->_add_prediction($gene)
467 sub _add_prediction
{
468 my ($self, $gene) = @_;
470 if(! exists($self->{'_preds'})) {
471 $self->{'_preds'} = [];
473 push(@
{$self->{'_preds'}}, $gene);
476 =head2 _predictions_parsed
478 Title : _predictions_parsed
479 Usage : $obj->_predictions_parsed
482 Returns : TRUE or FALSE
486 sub _predictions_parsed
{
487 my ($self, $val) = @_;
489 $self->{'_preds_parsed'} = $val if $val;
490 if(! exists($self->{'_preds_parsed'})) {
491 $self->{'_preds_parsed'} = 0;
493 return $self->{'_preds_parsed'};
499 Usage : $obj->_has_cds()
500 Function: Whether or not the result contains the predicted CDSs, too.
502 Returns : TRUE or FALSE
507 my ($self, $val) = @_;
509 $self->{'_has_cds'} = $val if $val;
510 if(! exists($self->{'_has_cds'})) {
511 $self->{'_has_cds'} = 0;
513 return $self->{'_has_cds'};
516 =head2 _read_fasta_seq
518 Title : _read_fasta_seq()
519 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
520 Function: Simple but specialised FASTA format sequence reader. Uses
521 $self->_readline() to retrieve input, and is able to strip off
522 the traling description lines.
524 Returns : An array of two elements.
528 sub _read_fasta_seq
{
533 return 0 unless (my $entry = $self->_readline());
536 # complete the entry if the first line came from a pushback buffer
537 while(! ($entry =~ />$/)) {
538 last unless ($_ = $self->_readline());
542 # delete everything onwards from an new fasta start (>)
543 $entry =~ s/\n>.*$//s;
546 if($entry =~ s/^(.+)\n//) {
551 #print "\n@@ $id \n@@ $seq \n##\n";
553 $self->throw("Can't parse Genemark predicted sequence entry");
555 $seq =~ s/\s//g; # Remove whitespace
562 Usage : $obj->_seqname($seqname)
570 my ($self, $val) = @_;
572 $self->{'_seqname'} = $val if $val;
573 if(! exists($self->{'_seqname'})) {
574 $self->{'_seqname'} = 'unknown';
576 return $self->{'_seqname'};