2 # BioPerl module for Bio::Tools::Genemark
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Mark Fiers <hlapp@gmx.net>
8 # Copyright Hilmar Lapp, Mark Fiers
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::Genemark - Results of one Genemark run
20 $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark');
22 $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT );
25 # note: this class is-a Bio::Tools::AnalysisResult which implements
26 # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same
27 while($gene = $Genemark->next_prediction()) {
28 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
29 # off Bio::SeqFeature::Gene::Transcript.
31 # $gene->exons() returns an array of
32 # Bio::Tools::Prediction::Exon objects
34 @exon_arr = $gene->exons();
37 @init_exons = $gene->exons('Initial');
39 @intrl_exons = $gene->exons('Internal');
41 @term_exons = $gene->exons('Terminal');
43 ($single_exon) = $gene->exons();
46 # essential if you gave a filename at initialization (otherwise the file
52 The Genemark module provides a parser for Genemark gene structure
53 prediction output. It parses one gene prediction into a
54 Bio::SeqFeature::Gene::Transcript- derived object.
56 This module has been developed around genemark.hmm for eukaryots v2.2a
57 and will probably not work with other versions.
60 This module also implements the Bio::SeqAnalysisParserI interface, and
61 thus can be used wherever such an object fits. See
62 L<Bio::SeqAnalysisParserI>.
68 User feedback is an integral part of the evolution of this and other
69 Bioperl modules. Send your comments and suggestions preferably to one
70 of the Bioperl mailing lists. Your participation is much appreciated.
72 bioperl-l@bioperl.org - General discussion
73 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
77 Please direct usage questions or support issues to the mailing list:
79 I<bioperl-l@bioperl.org>
81 rather than to the module maintainer directly. Many experienced and
82 reponsive experts will be able look at the problem and quickly
83 address it. Please include a thorough description of the problem
84 with code and data examples if at all possible.
88 Report bugs to the Bioperl bug tracking system to help us keep track
89 the bugs and their resolution. Bug reports can be submitted via the
92 https://github.com/bioperl/bioperl-live/issues
94 =head1 AUTHOR - Hilmar Lapp, Mark Fiers
97 m.w.e.j.fiers@plant.wag-ur.nl
101 The rest of the documentation details each of the object
102 methods. Internal methods are usually preceded with a _
107 # Let the code begin...
110 package Bio
::Tools
::Genemark
;
116 use Bio
::Tools
::Prediction
::Gene
;
117 use Bio
::Tools
::Prediction
::Exon
;
119 use Bio
::Factory
::FTLocationFactory
;
121 use base
qw(Bio::Tools::AnalysisResult);
126 Usage : my $obj = Bio::Tools::Genemark->new();
127 Function: Builds a new Bio::Tools::Genemark object
128 Returns : an instance of Bio::Tools::Genemark
135 my($class,@args) = @_;
137 my $self = $class->SUPER::new
(@args);
139 my ($seqname) = $self->_rearrange([qw(SEQNAME)], @args);
141 # hardwire seq_id when creating gene and exon objects
142 $self->_seqname($seqname) if defined($seqname);
147 sub _initialize_state
{
148 my ($self,@args) = @_;
150 # first call the inherited method!
151 $self->SUPER::_initialize_state
(@args);
153 # our private state variables
154 $self->{'_preds_parsed'} = 0;
155 $self->{'_has_cds'} = 0;
156 # array of pre-parsed predictions
157 $self->{'_preds'} = [];
159 $self->{'_seqstack'} = [];
162 =head2 analysis_method
164 Usage : $Genemark->analysis_method();
165 Purpose : Inherited method. Overridden to ensure that the name matches
173 sub analysis_method
{
175 my ($self, $method) = @_;
176 if($method && ($method !~ /Genemark\.hmm/i)) {
177 $self->throw("method $method not supported in " . ref($self));
179 return $self->SUPER::analysis_method
($method);
185 Usage : while($gene = $Genemark->next_feature()) {
188 Function: Returns the next gene structure prediction of the Genemark result
189 file. Call this method repeatedly until FALSE is returned.
191 The returned object is actually a SeqFeatureI implementing object.
192 This method is required for classes implementing the
193 SeqAnalysisParserI interface, and is merely an alias for
194 next_prediction() at present.
197 Returns : A Bio::Tools::Prediction::Gene object.
203 my ($self,@args) = @_;
204 # even though next_prediction doesn't expect any args (and this method
205 # does neither), we pass on args in order to be prepared if this changes
207 return $self->next_prediction(@args);
210 =head2 next_prediction
212 Title : next_prediction
213 Usage : while($gene = $Genemark->next_prediction()) {
216 Function: Returns the next gene structure prediction of the Genemark result
217 file. Call this method repeatedly until FALSE is returned.
220 Returns : A Bio::Tools::Prediction::Gene object.
225 sub next_prediction
{
229 # if the prediction section hasn't been parsed yet, we do this now
230 $self->_parse_predictions() unless $self->_predictions_parsed();
232 # get next gene structure
233 $gene = $self->_prediction();
238 =head2 _parse_predictions
240 Title : _parse_predictions()
241 Usage : $obj->_parse_predictions()
242 Function: Parses the prediction section. Automatically called by
243 next_prediction() if not yet done.
249 sub _parse_predictions
{
251 my %exontags = ('Initial' => 'Initial',
252 'Internal' => 'Internal',
253 'Terminal' => 'Terminal',
259 my $current_gene_no = -1;
261 # The prediction report does not contain a sequence identifier
262 # (at least the prokaryotic version doesn't)
263 my $seqname = $self->_seqname();
265 while(defined($_ = $self->_readline())) {
267 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) {
269 # this is an exon, Genemark doesn't predict anything else
270 # $prednr corresponds to geneno.
275 if ($2) { my $signalnr = $2; } # used in tag: exon_no
279 my @flds = split(' ', $_);
281 # create the feature (an exon) object
282 my $predobj = Bio
::Tools
::Prediction
::Exon
->new();
285 # define info depending on it being eu- or prokaryot
286 my ($start, $end, $orientation, $prediction_source);
288 if ($self->analysis_method() =~ /PROKARYOTIC/i) {
289 $prediction_source = "Genemark.hmm.pro";
290 $orientation = ($flds[1] eq '+') ?
1 : -1;
291 ($start, $end) = @flds[(2,3)];
295 $prediction_source = "Genemark.hmm.eu";
296 $orientation = ($flds[2] eq '+') ?
1 : -1;
297 ($start, $end) = @flds[(4,5)];
301 # instatiate a location object via
302 # Bio::Factory::FTLocationFactory (to handle
303 # inexact coordinates)
304 my $location_string = join('..', $start, $end);
305 my $location_factory = Bio
::Factory
::FTLocationFactory
->new();
306 my $location_obj = $location_factory->from_string($location_string);
307 $predobj->location($location_obj);
309 #store the data in the exon object
310 $predobj->source_tag($prediction_source);
311 $predobj->strand($orientation);
313 $predobj->primary_tag($exontags{$exontag} . "Exon");
315 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr);
317 $predobj->is_coding(1);
319 $predobj->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
321 # frame calculation as in the genscan module
322 # is to be implemented...
324 #If the $prednr is not equal to the current gene, we
325 #need to make a new gene and close the old one
326 if($prednr != $current_gene_no) {
327 # a new gene, store the old one if it exists
328 if (defined ($gene)) {
329 $gene->seq_id($seqname);
333 $gene = Bio
::Tools
::Prediction
::Gene
->new
335 '-primary' => "GenePrediction$prednr",
336 '-source' => $prediction_source);
337 $self->_add_prediction($gene);
338 $current_gene_no = $prednr;
339 $gene->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
342 # Add the exon to the gene
343 $gene->add_exon($predobj, ($exontag eq "_na_" ?
344 undef : $exontags{$exontag}));
348 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) {
349 $self->analysis_method($1);
353 $self->analysis_method_version($gm_version);
357 #Matrix file for eukaryot version
358 if (/^Matrices file:\s+(\S+)?/i) {
359 $self->analysis_subject($1);
360 # since the line after the matrix file is always the date
361 # (in the output file's I have seen!) extract and store this
363 if (defined(my $_date = $self->_readline())) {
365 $self->analysis_date($_date);
369 #Matrix file for prokaryot version
370 if (/^Model file name:\s+(\S+)/) {
371 $self->analysis_subject($1);
372 # since the line after the matrix file is always the date
373 # (in the output file's I have seen!) extract and store this
375 my $_date = $self->_readline() ;
376 if (defined($_date = $self->_readline())) {
378 $self->analysis_date($_date);
382 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) {
384 # $self->analysis_subject($seqname);
390 $self->_pushback($_);
392 # section of predicted aa sequences on recognition
393 # of a fasta start, read all sequences and find the
396 my ($aa_id, $seq) = $self->_read_fasta_seq();
397 last unless ($aa_id);
399 #now parse through the predictions to add the pred. protein
400 FINDPRED
: foreach my $gene (@
{$self->{'_preds'}}) {
401 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
403 if ($aa_id =~ /\|gene.$geneno\|/) {
404 #print "x SEQ : \n $seq \nXXXX\n";
405 my $seqobj = Bio
::Seq
->new('-seq' => $seq,
406 '-display_id' => $aa_id,
407 '-alphabet' => "protein");
408 $gene->predicted_protein($seqobj);
419 # if the analysis query object contains a ref to a Seq of PrimarySeq
420 # object, then extract the predicted sequences and add it to the gene
422 if (defined $self->analysis_query()) {
423 my $orig_seq = $self->analysis_query();
424 FINDPREDSEQ
: foreach my $gene (@
{$self->{'_preds'}}) {
426 foreach my $exon ($gene->exons()) {
427 #print $exon->start() . " " . $exon->end () . "\n";
428 $predseq .= $orig_seq->subseq($exon->start(), $exon->end());
431 my $seqobj = Bio
::PrimarySeq
->new('-seq' => $predseq,
432 '-display_id' => "transl");
433 $gene->predicted_cds($seqobj);
438 $self->_predictions_parsed(1);
443 Title : _prediction()
444 Usage : $gene = $obj->_prediction()
454 return unless(exists($self->{'_preds'}) && @
{$self->{'_preds'}});
455 return shift(@
{$self->{'_preds'}});
458 =head2 _add_prediction
460 Title : _add_prediction()
461 Usage : $obj->_add_prediction($gene)
468 sub _add_prediction
{
469 my ($self, $gene) = @_;
471 if(! exists($self->{'_preds'})) {
472 $self->{'_preds'} = [];
474 push(@
{$self->{'_preds'}}, $gene);
477 =head2 _predictions_parsed
479 Title : _predictions_parsed
480 Usage : $obj->_predictions_parsed
483 Returns : TRUE or FALSE
487 sub _predictions_parsed
{
488 my ($self, $val) = @_;
490 $self->{'_preds_parsed'} = $val if $val;
491 if(! exists($self->{'_preds_parsed'})) {
492 $self->{'_preds_parsed'} = 0;
494 return $self->{'_preds_parsed'};
500 Usage : $obj->_has_cds()
501 Function: Whether or not the result contains the predicted CDSs, too.
503 Returns : TRUE or FALSE
508 my ($self, $val) = @_;
510 $self->{'_has_cds'} = $val if $val;
511 if(! exists($self->{'_has_cds'})) {
512 $self->{'_has_cds'} = 0;
514 return $self->{'_has_cds'};
517 =head2 _read_fasta_seq
519 Title : _read_fasta_seq()
520 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
521 Function: Simple but specialised FASTA format sequence reader. Uses
522 $self->_readline() to retrieve input, and is able to strip off
523 the traling description lines.
525 Returns : An array of two elements.
529 sub _read_fasta_seq
{
534 return 0 unless (my $entry = $self->_readline());
537 # complete the entry if the first line came from a pushback buffer
538 while(! ($entry =~ />$/)) {
539 last unless ($_ = $self->_readline());
543 # delete everything onwards from an new fasta start (>)
544 $entry =~ s/\n>.*$//s;
547 if($entry =~ s/^(.+)\n//) {
552 #print "\n@@ $id \n@@ $seq \n##\n";
554 $self->throw("Can't parse Genemark predicted sequence entry");
556 $seq =~ s/\s//g; # Remove whitespace
563 Usage : $obj->_seqname($seqname)
571 my ($self, $val) = @_;
573 $self->{'_seqname'} = $val if $val;
574 if(! exists($self->{'_seqname'})) {
575 $self->{'_seqname'} = 'unknown';
577 return $self->{'_seqname'};