2 # BioPerl module for Bio::Tools::Glimmer
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::Glimmer - parser for Glimmer 2.X/3.X prokaryotic and
17 GlimmerM/GlimmerHMM eukaryotic gene predictions
21 use Bio::Tools::Glimmer;
24 my $parser = Bio::Tools::Glimmer->new(-file => $file);
26 $parser = Bio::Tools::Glimmer->new( -fh => \*INPUT );
27 # provide a sequence identifier (Glimmer 2.X)
28 my $parser = Bio::Tools::Glimmer->new(-file => $file, -seqname => seqname);
29 # force format (override automatic detection)
30 my $parser = Bio::Tools::Glimmer->new(-file => $file, -format => 'GlimmerM');
33 # note: this class is-a Bio::Tools::AnalysisResult which implements
34 # Bio::SeqAnalysisParserI, i.e., $glimmer->next_feature() is the same
36 while(my $gene = $parser->next_prediction()) {
37 # For eukaryotic input (GlimmerM/GlimmerHMM), $gene will be an instance
38 # of Bio::Tools::Prediction::Gene, which inherits off
39 # Bio::SeqFeature::Gene::Transcript, and $gene->exons() will return an
40 # array of Bio::Tools::Prediction::Exon objects.
41 # For prokaryotic input (Glimmer2.X/Glimmer3.X), $gene will be an
42 # instance of Bio::SeqFeature::Generic
44 # all exons (eukaryotic only):
45 @exon_arr = $gene->exons();
47 @init_exons = $gene->exons('Initial');
49 @intrl_exons = $gene->exons('Internal');
51 @term_exons = $gene->exons('Terminal');
56 This is a module for parsing Glimmer, GlimmerM and GlimmerHMM predictions.
57 It will create gene objects from the prediction report which can
58 be attached to a sequence using Bioperl objects, or output as GFF
59 suitable for loading into Bio::DB::GFF for use with Gbrowse.
61 Glimmer is open source and available at
62 L<http://www.cbcb.umd.edu/software/glimmer/>.
64 GlimmerM is open source and available at
65 L<http://www.tigr.org/software/glimmerm/>.
67 GlimmerHMM is open source and available at
68 L<http://www.cbcb.umd.edu/software/GlimmerHMM/>.
70 Note that Glimmer 2.X will only process the first
71 sequence in a fasta file, and the prediction report does not contain any
72 sort of sequence identifier
74 Note that Glimmer 3.X produces two output files. This module only parses
82 User feedback is an integral part of the evolution of this and other
83 Bioperl modules. Send your comments and suggestions preferably to
84 the Bioperl mailing list. Your participation is much appreciated.
86 bioperl-l@bioperl.org - General discussion
87 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
91 Please direct usage questions or support issues to the mailing list:
93 I<bioperl-l@bioperl.org>
95 rather than to the module maintainer directly. Many experienced and
96 reponsive experts will be able look at the problem and quickly
97 address it. Please include a thorough description of the problem
98 with code and data examples if at all possible.
100 =head2 Reporting Bugs
102 Report bugs to the Bioperl bug tracking system to help us keep track
103 of the bugs and their resolution. Bug reports can be submitted via
106 https://github.com/bioperl/bioperl-live/issues
108 =head1 AUTHOR - Jason Stajich
110 Email jason-at-bioperl-dot-org
120 The rest of the documentation details each of the object methods.
121 Internal methods are usually preceded with a _
126 # Let the code begin...
129 package Bio
::Tools
::Glimmer
;
132 use Bio
::Factory
::FTLocationFactory
;
133 use Bio
::Tools
::Prediction
::Gene
;
134 use Bio
::Tools
::Prediction
::Exon
;
136 use base
qw(Bio::Tools::AnalysisResult);
138 sub _initialize_state
{
139 my($self,@args) = @_;
141 # first call the inherited method!
142 my $make = $self->SUPER::_initialize_state
(@args);
144 $self->{'_preds_parsed'} = 0;
145 # array of pre-parsed predictions
146 $self->{'_preds'} = [];
152 Usage : my $obj = Bio::Tools::Glimmer->new();
153 Function: Builds a new Bio::Tools::Glimmer object
154 Returns : an instance of Bio::Tools::Glimmer
155 Args : format ('Glimmer', 'GlimmerM', 'GlimmerHMM'), seqname
161 my($class,@args) = @_;
163 my $self = $class->SUPER::new
(@args);
165 my ($format, $seqname, $seqlength, $detail) =
166 $self->_rearrange([qw(FORMAT SEQNAME SEQLENGTH DETAIL)], @args);
168 # override automagic format detection
169 if (defined($format) &&
170 (($format eq 'Glimmer') ||
171 ($format eq 'GlimmerM') ||
172 ($format eq 'GlimmerHMM'))
174 $self->_format($format);
177 if (defined($detail)) {
178 $self->_format('Glimmer');
179 $self->_detail_file($detail);
182 # hardwire seq_id when creating gene and exon objects (Glimmer 2.X)
183 $self->_seqname($seqname) if defined($seqname);
185 # store the length of the input sequence (Glimmer 2.X)
186 $self->_seqlength($seqlength) if defined($seqlength);
191 =head2 analysis_method
193 Usage : $glimmer->analysis_method();
194 Purpose : Inherited method. Overridden to ensure that the name matches
202 sub analysis_method
{
204 my ($self, $method) = @_;
205 if($method && ($method !~ /glimmer/i)) {
206 $self->throw("method $method not supported in " . ref($self));
208 return $self->SUPER::analysis_method
($method);
214 Usage : while($gene = $glimmer->next_feature()) {
217 Function: Returns the next gene structure prediction of the Glimmer result
218 file. Call this method repeatedly until FALSE is returned.
220 The returned object is actually a SeqFeatureI implementing object.
221 This method is required for classes implementing the
222 SeqAnalysisParserI interface, and is merely an alias for
223 next_prediction() at present.
226 Returns : A Bio::Tools::Prediction::Gene object.
232 my ($self,@args) = @_;
233 # even though next_prediction doesn't expect any args (and this method
234 # does neither), we pass on args in order to be prepared if this changes
236 return $self->next_prediction(@args);
239 =head2 next_prediction
241 Title : next_prediction
242 Usage : while($gene = $glimmer->next_prediction()) {
245 Function: Returns the next gene structure prediction of the Glimmer result
246 file. Call this method repeatedly until FALSE is returned.
249 Returns : A Bio::Tools::Prediction::Gene object.
254 sub next_prediction
{
258 # if the prediction section hasn't been parsed yet, we do this now
259 $self->_parse_predictions() unless $self->_predictions_parsed();
261 # get next gene structure
262 $gene = $self->_prediction();
266 =head2 _parse_predictions
268 Title : _parse_predictions()
269 Usage : $obj->_parse_predictions()
270 Function: Parses the prediction section. Automatically called by
271 next_prediction() if not yet done.
277 sub _parse_predictions
{
283 'Glimmer' => '_parse_prokaryotic',
284 'GlimmerM' => '_parse_eukaryotic',
285 'GlimmerHMM' => '_parse_eukaryotic',
286 '_DEFAULT_' => '_parse_eukaryotic',
289 my $format = $self->_format();
293 while (my $line = $self->_readline()) {
295 if ( $line =~ /^Glimmer\S*\s+\(Version\s*\S+\)/ ) {
296 $format = 'GlimmerM';
297 $self->_pushback($line);
300 elsif ( $line =~ /^Glimmer\S*$/ ) {
301 $format = 'GlimmerHMM';
302 $self->_pushback($line);
305 elsif ($line =~ /^Putative Genes:$/) {
307 $self->_pushback($line);
310 elsif ($line =~ /^>(\S+)/) {
312 $self->_pushback($line);
321 (exists($method{$format})) ?
$method{$format} : $method{'_DEFAULT_'};
323 return $self->$method();
328 =head2 _parse_eukaryotic
330 Title : _parse_eukaryotic()
331 Usage : $obj->_parse_eukaryotic()
332 Function: Parses the prediction section. Automatically called by
333 next_prediction() if not yet done.
339 sub _parse_eukaryotic
{
342 my ($gene,$seqname,$seqlen,$source,$lastgenenum);
344 while(defined($_ = $self->_readline())) {
345 if( /^(Glimmer\S*)\s+\(Version\s*(\S+)\)/ ) {
348 } elsif( /^(GlimmerHMM\S*)$/ ) { # GlimmerHMM has no version
351 } elsif(/^Sequence name:\s+(.+)$/ ) {
354 } elsif( /^Sequence length:\s+(\S+)/ ) {
357 } elsif( m/^(Predicted genes)|(Gene)|\s+\#/ || /^\s+$/ ) {
360 } elsif( # GlimmerM/HMM gene-exon prediction line
361 /^\s
+(\d
+)\s
+ # gene num
365 (\d
+)\s
+(\d
+) # exon start, end
366 \s
+(\d
+) # exon length
368 my ($genenum,$exonnum,$strand,$type,$start,$end,$len) =
369 ( $1,$2,$3,$4,$5,$6,$7);
370 if( ! $lastgenenum || $lastgenenum != $genenum) {
371 $self->_add_prediction($gene) if ( $gene );
372 $gene = Bio
::Tools
::Prediction
::Gene
->new
374 '-seq_id' => $seqname,
375 '-primary_tag' => "gene",
376 '-source_tag' => $source,
377 '-tag' => { 'Group' => "GenePrediction$genenum"},
380 my $exon = Bio
::Tools
::Prediction
::Exon
->new
381 ('-seq_id' => $seqname,
384 '-strand' => $strand eq '-' ?
'-1' : '1',
385 '-source_tag' => $source,
386 '-primary_tag'=> 'exon',
387 '-tag' => { 'Group' => "GenePrediction$genenum"},
389 $gene->add_exon($exon,lc($type));
390 $lastgenenum = $genenum;
393 $self->_add_prediction($gene) if( $gene );
394 $self->_predictions_parsed(1);
397 =head2 _parse_prokaryotic
399 Title : _parse_prokaryotic()
400 Usage : $obj->_parse_prokaryotic()
401 Function: Parses the prediction section. Automatically called by
402 next_prediction() if not yet done.
408 sub _parse_prokaryotic
{
411 # default value, possibly overriden later
412 my $source = 'Glimmer';
414 # Store the sequence length(s) here, either from the
415 # seqlength arg to the constructor, or from the
416 # Glimmer 3.X detail file
419 # Glimmer 2.X does not provide a sequence identifer
420 # in the prediction report (will default to unknown
421 # if not specified in the seqname arg to the
424 # Glimmer 2.X does not report the length of the
425 # input sequence, either (will default to undef
426 # if not specified in the seqlength arg to the
428 my $seqname = $self->_seqname();
429 my $seqlength = $self->_seqlength();
431 if (defined($seqlength)) {
432 $seqlength{$seqname} = $seqlength
435 # Parse the detail file, if we have one (Glimmer 3.X)
436 my $detail_file = $self->_detail_file();
438 if (defined($detail_file)) {
440 my $io = Bio
::Root
::IO
->new(-file
=> $detail_file);
443 while (defined($_ = $io->_readline())) {
444 if ($_ =~ /^>(\S+)/) {
449 if (defined($seqname) && ($_ =~ /^Sequence length = (\d+)$/)) {
450 $seqlength{$seqname} = $1;
456 my $location_factory = Bio
::Factory
::FTLocationFactory
->new();
458 while(defined($_ = $self->_readline())) {
459 # Glimmer 3.X does provide a sequence identifier -
460 # beware whitespace at the end (comes through from
462 if ($_ =~ /^Putative Genes:$/) {
463 $source = 'Glimmer_2.X';
466 # Glimmer 3.X sequence identifier
467 elsif ($_ =~ /^>(\S+)/) {
469 $seqlength = $seqlength{$seqname};
470 $source = 'Glimmer_3.X';
474 # Glimmer 2.X prediction
475 (/^\s
+(\d
+)\s
+ # gene num
476 (\d
+)\s
+(\d
+)\s
+ # start, end
477 \
[([\
+\
-])(\d
{1})\s
+ # strand, frame
479 # Glimmer 3.X prediction
480 (/^[^\d
]+(\d
+)\s
+ # orf (numeric portion)
481 (\d
+)\s
+(\d
+)\s
+ # start, end
482 ([\
+\
-])(\d
{1})\s
+ # strand, frame
485 my ($genenum,$start,$end,$strand,$frame,$score) =
486 ( $1,$2,$3,$4,$5,$6 );
488 my $circular_prediction = 0;
490 # Check for a circular prediction before we
491 # start fiddling with the coordinates
492 if ($strand eq '+') {
494 $circular_prediction = 1;
499 $circular_prediction = 1;
503 if ($circular_prediction) {
504 unless (defined($seqlength)) {
505 $self->throw("need to know the sequence length to handle wraparound genes");
509 # Glimmer 2.X predictions do not include
510 # the stop codon - this might extend the
511 # prediction off either end of the sequence.
512 # This works fine even on circular/wraparound
514 if ($source eq 'Glimmer_2.X') {
515 if ($strand eq '+') {
523 # We might have extended a Glimmer 2.X prediction
524 # beyond the boundaries of the input sequence.
525 # Also, Glimmer 3.X (with -X) will output predictions
526 # with coordinates less than 1 or greater than the
527 # length of the sequence.
529 foreach my $coord ($start, $end) {
533 } elsif (defined($seqlength) && ($coord > $seqlength)) {
534 $coord = ">$seqlength";
541 if ($circular_prediction) {
542 if ($strand eq '+') {
543 $location_string = "join($start..$seqlength,1..$end)";
546 $location_string = "join($start..1,$seqlength..$end)";
550 # start must always be less than end for gene locations
551 if ($strand eq '-' && !$fst && !$fend && $start > $end) {
552 ($start, $end) = ($end, $start);
554 $location_string = "$start..$end";
557 my $location_object =
558 $location_factory->from_string($location_string);
560 # convert glimmer's frame range from 1-3 to SeqFeature's 0-2.
563 my $gene = Bio
::SeqFeature
::Generic
->new
565 '-seq_id' => $seqname,
566 '-location' => $location_object,
567 '-strand' => $strand eq '-' ?
'-1' : '1',
569 '-source_tag' => $source,
570 '-display_name' => "orf$genenum",
571 '-primary_tag'=> 'gene',
572 '-tag' => { 'Group' => "GenePrediction_$genenum"},
573 '-score' => $score || undef
576 $self->_add_prediction($gene)
580 $self->_predictions_parsed(1);
585 Title : _prediction()
586 Usage : $gene = $obj->_prediction()
596 return unless(exists($self->{'_preds'}) && @
{$self->{'_preds'}});
597 return shift(@
{$self->{'_preds'}});
600 =head2 _add_prediction
602 Title : _add_prediction()
603 Usage : $obj->_add_prediction($gene)
610 sub _add_prediction
{
611 my ($self, $gene) = @_;
613 if(! exists($self->{'_preds'})) {
614 $self->{'_preds'} = [];
616 push(@
{$self->{'_preds'}}, $gene);
619 =head2 _predictions_parsed
621 Title : _predictions_parsed
622 Usage : $obj->_predictions_parsed
625 Returns : TRUE or FALSE
629 sub _predictions_parsed
{
630 my ($self, $val) = @_;
632 $self->{'_preds_parsed'} = $val if $val;
633 if(! exists($self->{'_preds_parsed'})) {
634 $self->{'_preds_parsed'} = 0;
636 return $self->{'_preds_parsed'};
642 Usage : $obj->_seqname($seqname)
643 Function: internal (for Glimmer 2.X)
650 my ($self, $val) = @_;
652 $self->{'_seqname'} = $val if $val;
653 if(! exists($self->{'_seqname'})) {
654 $self->{'_seqname'} = 'unknown';
656 return $self->{'_seqname'};
662 Usage : $obj->_seqlength($seqlength)
663 Function: internal (for Glimmer 2.X)
670 my ($self, $val) = @_;
672 $self->{'_seqlength'} = $val if $val;
673 return $self->{'_seqlength'};
679 Usage : $obj->_format($format)
687 my ($self, $val) = @_;
689 $self->{'_format'} = $val if $val;
691 return $self->{'_format'};
697 Usage : $obj->_detail_file($filename)
698 Function: internal (for Glimmer 3.X)
705 my ($self, $val) = @_;
707 $self->{'_detail_file'} = $val if $val;
708 return $self->{'_detail_file'};