Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / Tools / Glimmer.pm
blob1d8cbfa98d709cc526ce466cf55bc4c4a4a5334e
2 # BioPerl module for Bio::Tools::Glimmer
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::Glimmer - parser for Glimmer 2.X/3.X prokaryotic and
17 GlimmerM/GlimmerHMM eukaryotic gene predictions
19 =head1 SYNOPSIS
21 use Bio::Tools::Glimmer;
23 # file
24 my $parser = Bio::Tools::Glimmer->new(-file => $file);
25 # filehandle:
26 $parser = Bio::Tools::Glimmer->new( -fh => \*INPUT );
27 # provide a sequence identifier (Glimmer 2.X)
28 my $parser = Bio::Tools::Glimmer->new(-file => $file, -seqname => seqname);
29 # force format (override automatic detection)
30 my $parser = Bio::Tools::Glimmer->new(-file => $file, -format => 'GlimmerM');
32 # parse the results
33 # note: this class is-a Bio::Tools::AnalysisResult which implements
34 # Bio::SeqAnalysisParserI, i.e., $glimmer->next_feature() is the same
36 while(my $gene = $parser->next_prediction()) {
37 # For eukaryotic input (GlimmerM/GlimmerHMM), $gene will be an instance
38 # of Bio::Tools::Prediction::Gene, which inherits off
39 # Bio::SeqFeature::Gene::Transcript, and $gene->exons() will return an
40 # array of Bio::Tools::Prediction::Exon objects.
41 # For prokaryotic input (Glimmer2.X/Glimmer3.X), $gene will be an
42 # instance of Bio::SeqFeature::Generic
44 # all exons (eukaryotic only):
45 @exon_arr = $gene->exons();
46 # initial exons only
47 @init_exons = $gene->exons('Initial');
48 # internal exons only
49 @intrl_exons = $gene->exons('Internal');
50 # terminal exons only
51 @term_exons = $gene->exons('Terminal');
54 =head1 DESCRIPTION
56 This is a module for parsing Glimmer, GlimmerM and GlimmerHMM predictions.
57 It will create gene objects from the prediction report which can
58 be attached to a sequence using Bioperl objects, or output as GFF
59 suitable for loading into Bio::DB::GFF for use with Gbrowse.
61 Glimmer is open source and available at
62 L<http://www.cbcb.umd.edu/software/glimmer/>.
64 GlimmerM is open source and available at
65 L<http://www.tigr.org/software/glimmerm/>.
67 GlimmerHMM is open source and available at
68 L<http://www.cbcb.umd.edu/software/GlimmerHMM/>.
70 Note that Glimmer 2.X will only process the first
71 sequence in a fasta file, and the prediction report does not contain any
72 sort of sequence identifier
74 Note that Glimmer 3.X produces two output files. This module only parses
75 the .predict file.
78 =head1 FEEDBACK
80 =head2 Mailing Lists
82 User feedback is an integral part of the evolution of this and other
83 Bioperl modules. Send your comments and suggestions preferably to
84 the Bioperl mailing list. Your participation is much appreciated.
86 bioperl-l@bioperl.org - General discussion
87 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
89 =head2 Support
91 Please direct usage questions or support issues to the mailing list:
93 I<bioperl-l@bioperl.org>
95 rather than to the module maintainer directly. Many experienced and
96 reponsive experts will be able look at the problem and quickly
97 address it. Please include a thorough description of the problem
98 with code and data examples if at all possible.
100 =head2 Reporting Bugs
102 Report bugs to the Bioperl bug tracking system to help us keep track
103 of the bugs and their resolution. Bug reports can be submitted via
104 email or the web:
106 https://github.com/bioperl/bioperl-live/issues
108 =head1 AUTHOR - Jason Stajich
110 Email jason-at-bioperl-dot-org
112 =head1 CONTRIBUTORS
114 Torsten Seemann
116 Mark Johnson
118 =head1 APPENDIX
120 The rest of the documentation details each of the object methods.
121 Internal methods are usually preceded with a _
123 =cut
126 # Let the code begin...
129 package Bio::Tools::Glimmer;
130 use strict;
132 use Bio::Factory::FTLocationFactory;
133 use Bio::Tools::Prediction::Gene;
134 use Bio::Tools::Prediction::Exon;
136 use base qw(Bio::Tools::AnalysisResult);
138 sub _initialize_state {
139 my($self,@args) = @_;
141 # first call the inherited method!
142 my $make = $self->SUPER::_initialize_state(@args);
144 $self->{'_preds_parsed'} = 0;
145 # array of pre-parsed predictions
146 $self->{'_preds'} = [];
149 =head2 new
151 Title : new
152 Usage : my $obj = Bio::Tools::Glimmer->new();
153 Function: Builds a new Bio::Tools::Glimmer object
154 Returns : an instance of Bio::Tools::Glimmer
155 Args : format ('Glimmer', 'GlimmerM', 'GlimmerHMM'), seqname
158 =cut
160 sub new {
161 my($class,@args) = @_;
163 my $self = $class->SUPER::new(@args);
165 my ($format, $seqname, $seqlength, $detail) =
166 $self->_rearrange([qw(FORMAT SEQNAME SEQLENGTH DETAIL)], @args);
168 # override automagic format detection
169 if (defined($format) &&
170 (($format eq 'Glimmer') ||
171 ($format eq 'GlimmerM') ||
172 ($format eq 'GlimmerHMM'))
174 $self->_format($format);
177 if (defined($detail)) {
178 $self->_format('Glimmer');
179 $self->_detail_file($detail);
182 # hardwire seq_id when creating gene and exon objects (Glimmer 2.X)
183 $self->_seqname($seqname) if defined($seqname);
185 # store the length of the input sequence (Glimmer 2.X)
186 $self->_seqlength($seqlength) if defined($seqlength);
188 return $self;
191 =head2 analysis_method
193 Usage : $glimmer->analysis_method();
194 Purpose : Inherited method. Overridden to ensure that the name matches
195 /glimmer/i.
196 Returns : String
197 Argument : n/a
199 =cut
201 #-------------
202 sub analysis_method {
203 #-------------
204 my ($self, $method) = @_;
205 if($method && ($method !~ /glimmer/i)) {
206 $self->throw("method $method not supported in " . ref($self));
208 return $self->SUPER::analysis_method($method);
211 =head2 next_feature
213 Title : next_feature
214 Usage : while($gene = $glimmer->next_feature()) {
215 # do something
217 Function: Returns the next gene structure prediction of the Glimmer result
218 file. Call this method repeatedly until FALSE is returned.
220 The returned object is actually a SeqFeatureI implementing object.
221 This method is required for classes implementing the
222 SeqAnalysisParserI interface, and is merely an alias for
223 next_prediction() at present.
225 Example :
226 Returns : A Bio::Tools::Prediction::Gene object.
227 Args :
229 =cut
231 sub next_feature {
232 my ($self,@args) = @_;
233 # even though next_prediction doesn't expect any args (and this method
234 # does neither), we pass on args in order to be prepared if this changes
235 # ever
236 return $self->next_prediction(@args);
239 =head2 next_prediction
241 Title : next_prediction
242 Usage : while($gene = $glimmer->next_prediction()) {
243 # do something
245 Function: Returns the next gene structure prediction of the Glimmer result
246 file. Call this method repeatedly until FALSE is returned.
248 Example :
249 Returns : A Bio::Tools::Prediction::Gene object.
250 Args :
252 =cut
254 sub next_prediction {
255 my ($self) = @_;
256 my $gene;
258 # if the prediction section hasn't been parsed yet, we do this now
259 $self->_parse_predictions() unless $self->_predictions_parsed();
261 # get next gene structure
262 $gene = $self->_prediction();
263 return $gene;
266 =head2 _parse_predictions
268 Title : _parse_predictions()
269 Usage : $obj->_parse_predictions()
270 Function: Parses the prediction section. Automatically called by
271 next_prediction() if not yet done.
272 Example :
273 Returns :
275 =cut
277 sub _parse_predictions {
279 my ($self) = @_;
282 my %method = (
283 'Glimmer' => '_parse_prokaryotic',
284 'GlimmerM' => '_parse_eukaryotic',
285 'GlimmerHMM' => '_parse_eukaryotic',
286 '_DEFAULT_' => '_parse_eukaryotic',
289 my $format = $self->_format();
291 if (!$format) {
293 while (my $line = $self->_readline()) {
295 if ( $line =~ /^Glimmer\S*\s+\(Version\s*\S+\)/ ) {
296 $format = 'GlimmerM';
297 $self->_pushback($line);
298 last;
300 elsif ( $line =~ /^Glimmer\S*$/ ) {
301 $format = 'GlimmerHMM';
302 $self->_pushback($line);
303 last;
305 elsif ($line =~ /^Putative Genes:$/) {
306 $format = 'Glimmer';
307 $self->_pushback($line);
308 last;
310 elsif ($line =~ /^>(\S+)/) {
311 $format = 'Glimmer';
312 $self->_pushback($line);
313 last;
320 my $method =
321 (exists($method{$format})) ? $method{$format} : $method{'_DEFAULT_'};
323 return $self->$method();
328 =head2 _parse_eukaryotic
330 Title : _parse_eukaryotic()
331 Usage : $obj->_parse_eukaryotic()
332 Function: Parses the prediction section. Automatically called by
333 next_prediction() if not yet done.
334 Example :
335 Returns :
337 =cut
339 sub _parse_eukaryotic {
340 my ($self) = @_;
342 my ($gene,$seqname,$seqlen,$source,$lastgenenum);
344 while(defined($_ = $self->_readline())) {
345 if( /^(Glimmer\S*)\s+\(Version\s*(\S+)\)/ ) {
346 $source = "$1_$2";
347 next;
348 } elsif( /^(GlimmerHMM\S*)$/ ) { # GlimmerHMM has no version
349 $source = $1;
350 next;
351 } elsif(/^Sequence name:\s+(.+)$/ ) {
352 $seqname = $1;
353 next;
354 } elsif( /^Sequence length:\s+(\S+)/ ) {
355 $seqlen = $1;
356 next;
357 } elsif( m/^(Predicted genes)|(Gene)|\s+\#/ || /^\s+$/ ) {
358 next;
360 } elsif( # GlimmerM/HMM gene-exon prediction line
361 /^\s+(\d+)\s+ # gene num
362 (\d+)\s+ # exon num
363 ([\+\-])\s+ # strand
364 (\S+)\s+ # exon type
365 (\d+)\s+(\d+) # exon start, end
366 \s+(\d+) # exon length
367 /ox ) {
368 my ($genenum,$exonnum,$strand,$type,$start,$end,$len) =
369 ( $1,$2,$3,$4,$5,$6,$7);
370 if( ! $lastgenenum || $lastgenenum != $genenum) {
371 $self->_add_prediction($gene) if ( $gene );
372 $gene = Bio::Tools::Prediction::Gene->new
374 '-seq_id' => $seqname,
375 '-primary_tag' => "gene",
376 '-source_tag' => $source,
377 '-tag' => { 'Group' => "GenePrediction$genenum"},
380 my $exon = Bio::Tools::Prediction::Exon->new
381 ('-seq_id' => $seqname,
382 '-start' => $start,
383 '-end' => $end,
384 '-strand' => $strand eq '-' ? '-1' : '1',
385 '-source_tag' => $source,
386 '-primary_tag'=> 'exon',
387 '-tag' => { 'Group' => "GenePrediction$genenum"},
389 $gene->add_exon($exon,lc($type));
390 $lastgenenum = $genenum;
393 $self->_add_prediction($gene) if( $gene );
394 $self->_predictions_parsed(1);
397 =head2 _parse_prokaryotic
399 Title : _parse_prokaryotic()
400 Usage : $obj->_parse_prokaryotic()
401 Function: Parses the prediction section. Automatically called by
402 next_prediction() if not yet done.
403 Example :
404 Returns :
406 =cut
408 sub _parse_prokaryotic {
409 my ($self) = @_;
411 # default value, possibly overriden later
412 my $source = 'Glimmer';
414 # Store the sequence length(s) here, either from the
415 # seqlength arg to the constructor, or from the
416 # Glimmer 3.X detail file
417 my %seqlength = ( );
419 # Glimmer 2.X does not provide a sequence identifer
420 # in the prediction report (will default to unknown
421 # if not specified in the seqname arg to the
422 # constructor
424 # Glimmer 2.X does not report the length of the
425 # input sequence, either (will default to undef
426 # if not specified in the seqlength arg to the
427 # constructor
428 my $seqname = $self->_seqname();
429 my $seqlength = $self->_seqlength();
431 if (defined($seqlength)) {
432 $seqlength{$seqname} = $seqlength
435 # Parse the detail file, if we have one (Glimmer 3.X)
436 my $detail_file = $self->_detail_file();
438 if (defined($detail_file)) {
440 my $io = Bio::Root::IO->new(-file => $detail_file);
441 my $seqname;
443 while (defined($_ = $io->_readline())) {
444 if ($_ =~ /^>(\S+)/) {
445 $seqname = $1;
446 next;
449 if (defined($seqname) && ($_ =~ /^Sequence length = (\d+)$/)) {
450 $seqlength{$seqname} = $1;
451 next;
456 my $location_factory = Bio::Factory::FTLocationFactory->new();
458 while(defined($_ = $self->_readline())) {
459 # Glimmer 3.X does provide a sequence identifier -
460 # beware whitespace at the end (comes through from
461 # the fasta file)
462 if ($_ =~ /^Putative Genes:$/) {
463 $source = 'Glimmer_2.X';
464 next;
466 # Glimmer 3.X sequence identifier
467 elsif ($_ =~ /^>(\S+)/) {
468 $seqname = $1;
469 $seqlength = $seqlength{$seqname};
470 $source = 'Glimmer_3.X';
471 next;
473 elsif (
474 # Glimmer 2.X prediction
475 (/^\s+(\d+)\s+ # gene num
476 (\d+)\s+(\d+)\s+ # start, end
477 \[([\+\-])(\d{1})\s+ # strand, frame
478 /ox ) ||
479 # Glimmer 3.X prediction
480 (/^[^\d]+(\d+)\s+ # orf (numeric portion)
481 (\d+)\s+(\d+)\s+ # start, end
482 ([\+\-])(\d{1})\s+ # strand, frame
483 ([\d\.]+) # score
484 /ox)) {
485 my ($genenum,$start,$end,$strand,$frame,$score) =
486 ( $1,$2,$3,$4,$5,$6 );
488 my $circular_prediction = 0;
490 # Check for a circular prediction before we
491 # start fiddling with the coordinates
492 if ($strand eq '+') {
493 if ($start > $end) {
494 $circular_prediction = 1;
497 else {
498 if ($start < $end) {
499 $circular_prediction = 1;
503 if ($circular_prediction) {
504 unless (defined($seqlength)) {
505 $self->throw("need to know the sequence length to handle wraparound genes");
509 # Glimmer 2.X predictions do not include
510 # the stop codon - this might extend the
511 # prediction off either end of the sequence.
512 # This works fine even on circular/wraparound
513 # predictions.
514 if ($source eq 'Glimmer_2.X') {
515 if ($strand eq '+') {
516 $end += 3;
518 else {
519 $end -= 3;
523 # We might have extended a Glimmer 2.X prediction
524 # beyond the boundaries of the input sequence.
525 # Also, Glimmer 3.X (with -X) will output predictions
526 # with coordinates less than 1 or greater than the
527 # length of the sequence.
528 my ($fst, $fend);
529 foreach my $coord ($start, $end) {
530 if ($coord < 1) {
531 $coord = '<1';
532 $fst++;
533 } elsif (defined($seqlength) && ($coord > $seqlength)) {
534 $coord = ">$seqlength";
535 $fend++;
539 my $location_string;
541 if ($circular_prediction) {
542 if ($strand eq '+') {
543 $location_string = "join($start..$seqlength,1..$end)";
545 else {
546 $location_string = "join($start..1,$seqlength..$end)";
549 else {
550 # start must always be less than end for gene locations
551 if ($strand eq '-' && !$fst && !$fend && $start > $end) {
552 ($start, $end) = ($end, $start);
554 $location_string = "$start..$end";
557 my $location_object =
558 $location_factory->from_string($location_string);
560 # convert glimmer's frame range from 1-3 to SeqFeature's 0-2.
561 $frame--;
563 my $gene = Bio::SeqFeature::Generic->new
565 '-seq_id' => $seqname,
566 '-location' => $location_object,
567 '-strand' => $strand eq '-' ? '-1' : '1',
568 '-frame' => $frame,
569 '-source_tag' => $source,
570 '-display_name' => "orf$genenum",
571 '-primary_tag'=> 'gene',
572 '-tag' => { 'Group' => "GenePrediction_$genenum"},
573 '-score' => $score || undef
576 $self->_add_prediction($gene)
580 $self->_predictions_parsed(1);
583 =head2 _prediction
585 Title : _prediction()
586 Usage : $gene = $obj->_prediction()
587 Function: internal
588 Example :
589 Returns :
591 =cut
593 sub _prediction {
594 my ($self) = @_;
596 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
597 return shift(@{$self->{'_preds'}});
600 =head2 _add_prediction
602 Title : _add_prediction()
603 Usage : $obj->_add_prediction($gene)
604 Function: internal
605 Example :
606 Returns :
608 =cut
610 sub _add_prediction {
611 my ($self, $gene) = @_;
613 if(! exists($self->{'_preds'})) {
614 $self->{'_preds'} = [];
616 push(@{$self->{'_preds'}}, $gene);
619 =head2 _predictions_parsed
621 Title : _predictions_parsed
622 Usage : $obj->_predictions_parsed
623 Function: internal
624 Example :
625 Returns : TRUE or FALSE
627 =cut
629 sub _predictions_parsed {
630 my ($self, $val) = @_;
632 $self->{'_preds_parsed'} = $val if $val;
633 if(! exists($self->{'_preds_parsed'})) {
634 $self->{'_preds_parsed'} = 0;
636 return $self->{'_preds_parsed'};
639 =head2 _seqname
641 Title : _seqname
642 Usage : $obj->_seqname($seqname)
643 Function: internal (for Glimmer 2.X)
644 Example :
645 Returns : String
647 =cut
649 sub _seqname {
650 my ($self, $val) = @_;
652 $self->{'_seqname'} = $val if $val;
653 if(! exists($self->{'_seqname'})) {
654 $self->{'_seqname'} = 'unknown';
656 return $self->{'_seqname'};
659 =head2 _seqlength
661 Title : _seqlength
662 Usage : $obj->_seqlength($seqlength)
663 Function: internal (for Glimmer 2.X)
664 Example :
665 Returns : String
667 =cut
669 sub _seqlength {
670 my ($self, $val) = @_;
672 $self->{'_seqlength'} = $val if $val;
673 return $self->{'_seqlength'};
676 =head2 _format
678 Title : _format
679 Usage : $obj->_format($format)
680 Function: internal
681 Example :
682 Returns : String
684 =cut
686 sub _format {
687 my ($self, $val) = @_;
689 $self->{'_format'} = $val if $val;
691 return $self->{'_format'};
694 =head2 _detail_file
696 Title : _detail_file
697 Usage : $obj->_detail_file($filename)
698 Function: internal (for Glimmer 3.X)
699 Example :
700 Returns : String
702 =cut
704 sub _detail_file {
705 my ($self, $val) = @_;
707 $self->{'_detail_file'} = $val if $val;
708 return $self->{'_detail_file'};