5 Bio::FeatureIO::geneseqer - FeatureIO parser for GeneSeqer reports
9 my $feature_in = Bio::FeatureIO->new(-format => 'geneseqer',
10 -file => 'myfile.geneseqer.out',
13 $feature_in->attach_alignments(1); #attach alignments to their AGSs
15 $feature_in->mode('both_separate'); #parse both alignments and PGLs from the
18 while(my $feat = $feature_in->next_alignment) {
19 #do something with alignment feature
22 while(my $feat = $feature_in->next_feature) {
24 #get the alignment objects attached to the PGL feature
28 } $feat->annotation->get_Annotations('supporting_alignment');
30 foreach my $alignment (@alignments) {
31 # $alignment is now a feature representing one of the EST
32 # alignments used to compute this PGL
40 Parses GeneSeqer reports, returning L<Bio::SeqFeature::Annotated>
41 objects. Alignments are represented by a
42 L<Bio::SeqFeature::Annotated> object with a 'Target' annotation,
43 holding a L<Bio::Annotation::Target> object. PGLs are also
44 L<Bio::SeqFeature::Annotated> objects.
46 The mode() accessor controls the parsing mode, which is one of:
52 Only predicted gene location (PGL) features will be parsed and
53 returned. Supporting alignments in the file will be ignored.
57 Only supporting EST/protein alignments will be returned, PGLs will be
62 Both PGLs and alignments will be parsed. next_feature() will return
63 PGLs only, and next_alignment() will return the alignments
67 Both PGLs and alignments will be parsed. next_feature() will return
68 B<both> PGLs and alignments. next_alignment will always return undef
72 If the parse mode is set to 'both_separate' or 'both_merged', you can
73 also set the accessor attach_alignments() to have the supporting
74 alignment features attached to the AGS features in which they are
75 used. These are attached under the Annotation key/tag
76 'supporting_alignment'. This is off by default.
78 In the future, this module may be enhanced to have the capability of
79 producing L<Bio::SeqFeature::Generic> and
80 L<Bio::SeqFeature::FeaturePair> objects.
82 =head1 STRUCTURE OF RETURNED FEATURES
84 This module returns a bunch of hierarchical features (that is, the
85 features have subfeatures) that represent the GeneSeqer output. The
86 feature hierarchy for PGLs is arranged as:
88 region - the GeneSeqer predicted gene location (PGL)
89 |- processed_transcript - each alternative gene structure (AGS) in the PGL
90 | |- CDS - each exon in that AGS
91 |- (optional) match - alignment, see below
94 and for alignments it is
96 match - spliced alignment of an EST/protein to the ref seq
97 |- match_part - each exon in the spliced alignment
99 Note that the alignments (match and match_part features) will only be
100 attached to the gene features if attach_alignments() is set to true.
102 =head1 REDUCING MEMORY REQUIREMENTS
104 If you find yourself running out of memory while parsing large
105 GeneSeqer output files with this parser, there are a couple of things
106 you can do to decrease the memory requirements:
108 1. do not use the 'both_separate' parsing mode, since it can sometimes
109 need to keep a large buffer of features in memory
110 2. make sure attach_alignments() is set to false (0 or '')
112 If these two measures are not adequate for you, please explain your
113 problem on the mailing list, and we may be able to address your
120 User feedback is an integral part of the evolution of this and other
121 Bioperl modules. Send your comments and suggestions preferably to
122 the Bioperl mailing list. Your participation is much appreciated.
124 bioperl-l@bioperl.org - General discussion
125 http://bioperl.org/MailList.shtml - About the mailing lists
127 =head2 Reporting Bugs
129 Report bugs to the Bioperl bug tracking system to help us keep track
130 of the bugs and their resolution. Bug reports can be submitted via
133 http://bugzilla.bioperl.org/
137 Robert Buels, rmb32@cornell.edu
141 The rest of the documentation details each of the object methods.
142 Internal methods are usually preceded with a _
146 # Let the code begin...
148 package Bio
::FeatureIO
::geneseqer
;
151 use base
qw(Bio::FeatureIO);
153 use Bio
::SeqFeature
::Annotated
;
157 constructor options for this parser are:
160 takes same arguments as mode() accessor below
162 takes same arguments as attach_alignments() accessor below
169 $self->SUPER::_initialize
(%arg);
172 $self->{alignment_buffer
} = [];
173 $self->{feature_buffer
} = [];
176 $arg{-mode
} ||= 'pgls';
177 $arg{-attach_alignments
} = 0 unless defined $arg{-attach_alignments
};
180 $self->mode($arg{-mode
});
181 $self->attach_alignments($arg{-attach_alignments
});
186 Usage: $fio->mode('alignments');
187 #parse and return only alignments
188 Desc : get/set the current parsing mode. Defaults to
190 Ret : currently set parse mode
191 Args : (optional) new parse mode, which is one of 'pgls',
192 'alignments', 'both_separate', or 'both_merged'
197 my ($self,$newmode) = @_;
198 if(defined $newmode) {
199 grep {$_ eq $newmode} qw
/alignments pgls both_separate both_merged/
200 or $self->throw("invalid mode selection $newmode");
201 $self->{parse_mode
} = $newmode;
203 return $self->{parse_mode
};
206 =head2 attach_alignments()
208 Usage: $fio->attach_alignments(1);
209 Desc : get/set a flag telling the parser whether to attach
210 the supporting alignment features to each AGS (gene
213 Attaches the alignments under a tag/Annotation
214 named 'supporting_alignment'.
215 Ret : current value of this flag.
216 Args : (optional) new true/false value of this flag
220 sub attach_alignments
{
221 my ($self,$newval) = @_;
222 if(defined $newval) {
223 $self->{attach_alignments
} = $newval;
225 return $self->{attach_alignments
};
228 =head2 next_feature()
230 Usage : my $feature = $featureio->next_feature();
231 Function: reads the next Predicted Gene Location from the given
232 geneseqer file, returning it as an object. The PGLs
233 will be returned in the same order in which they appear
234 in the GeneSeqer output.
235 Returns : a predicted gene location (PGL) feature, of class
236 L<Bio::SeqFeature::Annotated>
242 # return 'pgl/alignment' if in buffer
243 # else buffer 'pgl/alignment' and return it
245 my ($self,$featuretype) = shift;
246 $featuretype ||= 'pgl';
248 #validate attach_alignments and mode selections
249 if( $self->attach_alignments && $self->mode !~ /^both/ ) {
250 $self->throw("mode must be set to 'both_merged' or 'both_separate' if attach_alignments() is set to true");
253 return $self->next_alignment if $featuretype eq 'alignment';
255 $self->throw("invalid featuretype '$featuretype'") unless $featuretype eq 'pgl';
257 #now down to business
259 #get a pgl into the buffer if necessary
260 unless( @
{$self->{feature_buffer
}} ) {
261 $self->_buffer_item($self->mode eq 'both_merged' ?
'either' : 'pgl');
264 return shift @
{$self->{feature_buffer
}};
267 =head2 next_alignment()
269 Usage: my $feature = $fio->next_alignment
270 Desc : get the next EST/protein alignment feature from the
271 GeneSeqer report. The alignments will be returned in
272 the same order in which they appear in the file.
273 Ret : an alignment feature, or undef if there are no more
279 # alias for next_feature('alignment')
283 #validate attach_alignments and mode selections
284 if( $self->attach_alignments && $self->mode !~ /^both_/ ) {
285 $self->throw("mode must be set to 'both_separate' or 'both_merged' if attach_alignments() is set to true");
288 #get an alignment into the buffer if necessary
289 unless( @
{$self->{alignment_buffer
}} ) {
290 $self->_buffer_item('alignment');
293 return shift @
{$self->{alignment_buffer
}};
296 =head2 write_feature()
303 shift->throw_not_implemented;
306 #central repository for match patterns
308 my ($self,$patname) = @_;
311 pgl
=> qr/^\s*PGL\s+\d+\s+\(/,
312 ags
=> qr/^\s*AGS-\d+\s/,
313 sequence
=> qr/^\s*Sequence\s+\d+\s*:\s+(.+),\s+from/,
314 align
=> qr/^\s*\*{10}/,
315 finished
=> qr/^[^A-Za-z]+finished\s/,
317 return $patterns{$patname};
320 # buffer_item (central parsing loop)
321 # read from file. if mode says we're using it, parse the thing and buffer it.
322 # otherwise, **skip** it
323 # return 1 when you finally buffer a thing of the type your arg says you're looking for
326 my ($self,$requested_type) = @_;
327 grep {$requested_type eq $_} qw
/ alignment pgl either /
328 or $self->throw("invalid requested type '$requested_type'");
330 ### MAIN PARSING LOOP detects the beginnings of sections, pushes
331 # them back on the buffer, then calls the appropriate parsing
332 # routine for them. returns only if a.) it successfully finds and
333 # parses an item of the requested type or b.) it reaches EOF
334 while( my $line = $self->_readline ) {
336 # warn "decide line $.: $line\n";
337 #ifs are sorted in decreasing order of how common a case they are
338 if( $self->mode ne 'pgls' && $line =~ $self->_pattern('align') ) {
339 $self->_pushback($line);
340 $self->_buffer_alignment;
341 return 1 if $requested_type eq 'alignment' || $requested_type eq 'either';
344 elsif( $self->mode ne 'alignments' && $line =~ $self->_pattern('pgl') ) {
345 $self->_pushback($line);
347 return 1 if $requested_type eq 'pgl' || $requested_type eq 'either';
350 elsif( $line =~ $self->_pattern('sequence') ) {
351 $self->_pushback($line);
352 $self->_purge_alignment_index;
353 $self->_parse_sequence_section;
359 # assumes starting at the beginning of alignment section.
360 # parse the alignment section, and put the alignment feature in the alignment buffer
361 # if we're attaching, also index the alignment by its signature
363 #parse an 'EST sequence' section of the report, up to and
364 #including the '^MATCH ' line, returning a feature with
365 #subfeatures representing the spliced cDNA alignment
366 sub _buffer_alignment
{
369 my $buffer_for_alignments = $self->mode eq 'both_merged'
370 ?
$self->{feature_buffer
}
371 : $self->{alignment_buffer
};
373 #these uninitialized variables are filled in by the parsing below
377 my $cstrand; #strandedness of the EST we're currently looking at, either '+' or '-'
378 my $cid; #identifier of the EST we're looking at
379 my $clength; #length of the EST we're looking at
382 hqpgs
=> qr/^\s*hqPGS_(\S+_\S+[+-])/,
383 est_seq
=> qr/^\s*EST\s+sequence\s+\d+\s+([+-])\s*strand\s+(?:(\d+)\s+n\s+)?\(\s*File:\s*([^\)\s]+)\s*\)/,
384 exon
=> qr/^\s+Exon\s/,
386 seqline
=> qr/^\s+(\d+)\s+([ACTG ]+)/,
389 while( my $line = $self->_readline ) {
391 # warn "cdna parsing '$line'\n";
392 #EST sequence introduction
393 if( $line =~ $patterns{est_seq
} ) {
394 # warn "cdna est line\n";
395 #parse out the strand, length, and ID of the EST we're dealing with,
396 #storing them in the lexicals above
400 $cstrand eq chop $cid or $self->throw("inconsistent strandedness in line '$line'");
401 $cstrand = $cstrand eq '+' ?
1 :
402 $cstrand eq '-' ?
-1 : $self->throw("Unknown strand direction '$cstrand'");
404 #do a little error checking
405 $cid or $self->throw("can't parse sequence identifier from line '$line'");
406 $cstrand == 1 or $cstrand == -1 or $self->throw("can't parse EST alignment strand from line '$line'");
408 #try to figure out the est's length from the sequence lines
409 #if it wasn't given in the 'EST sequence' line
410 elsif( $line =~ $patterns{seqline
} ) {
411 my $startlength = $1;
413 $residues =~ s/\s//g;
414 my $newlength = $startlength + length($residues) - 1;
415 unless($clength >= $newlength) {
416 $clength = $newlength;
418 # warn "faked up clength $clength from line '$line'\n";
422 elsif( $line =~ $patterns{exon
} ) {
423 #parse out the genomic and cDNA start,end,length, and the score of the match
424 my (undef,$gstart,$gend,undef,$cstart,$cend,undef,$exon_score)
425 = _parse_numbers
($line);
427 #if we're looking at a reverse-complemented EST, un-reverse-complement its start and end coordinates
428 unless($cstrand == 1) {
429 $cstrand == -1 or $self->throw("invalid strandedness '$cstrand'");
430 $clength > 0 or $self->throw("can't parse EST length from line '$line'");
431 ($cstart,$cend) = _reverse_complement_coords
($cstart,$cend,$clength);
434 my $subfeature = $self->_new_feature( -start
=> $gstart,
436 -score
=> $exon_score,
437 -type
=> 'match_part',
438 -source
=> 'GeneSeqer',
439 -target
=> { -target_id
=> $cid,
444 ); #note: we'll go through
446 #later, when we get it
448 push @subfeatures,$subfeature;
451 elsif( $line =~ $patterns{match
} ) {
452 my (undef,$genomic_seqname,$cdna_seqname,$alignment_score) = split /\s+/,$line;
453 my $gstrand = _chop_strand
(\
$genomic_seqname);
454 my $other_cstrand = _chop_strand
(\
$cdna_seqname);
455 $cstrand eq $other_cstrand or $self->throw("inconsistent cDNA strandedness: $cstrand vs $other_cstrand");
457 #go back over the subfeatures and error check, figure out the
458 #parent feature start and end, and set the appropriate seq_id on
462 foreach my $subfeature (@subfeatures) {
463 #check that the strandedness is correct
464 $subfeature->strand eq $gstrand or $self->throw("parser bug, inconsistent strands found ('".$subfeature->strand."' vs '$gstrand')");
465 #set the proper seq_id
466 $subfeature->seq_id($genomic_seqname);
467 #set parent feature limits
468 $gstart = $subfeature->start unless $gstart && $gstart < $subfeature->start;
469 $gend = $subfeature->end unless $gend && $gend > $subfeature->end;
472 #make an overall feature for this EST alignment
473 $parent_feature = $self->_new_feature( -start
=> $gstart,
475 -seq_id
=> $genomic_seqname,
476 -source
=> 'GeneSeqer',
477 -score
=> $alignment_score,
482 #add all the subfeatures to it, reversing so they'll come out in the right order
483 $parent_feature->add_SeqFeature($_) foreach @subfeatures;
485 elsif( $line =~ $patterns{hqpgs
} ) {
487 #do some consistency checks with the contents of the PGS line
488 my (undef,$other_cid) = split /_/, $1;
490 if( $self->attach_alignments ) {
491 #now make the pgs signature string, and return it with the parent feature we made
492 my (undef,@pgs_stuff) = split /\s+/,$line;
494 #$pgs_signature a string GeneSeqer uses to uniquely identify
495 #this alignment. We'll use these later to connect the PGLs to
496 #their supporting alignments
497 my $pgs_signature = join(' ',@pgs_stuff,$other_cid);
498 # $parent_feature->add_Annotation('comment',Bio::Annotation::Comment->new(-text => $pgs_signature));
499 $self->_index_alignment($pgs_signature,$parent_feature);
502 $parent_feature or $self->throw("parse error");
504 push @
$buffer_for_alignments, $parent_feature;
510 #parse the section of the report introducing the processing of a new
511 #genomic sequence. right now, just par
512 sub _parse_sequence_section
{
514 my $line = $self->_readline;
516 #parse the sequence identifier out of the line
517 $line =~ $self->_pattern('sequence')
518 or $self->throw("improper call of _parse_sequence_section, line '$line' does not match expected pattern");
520 # warn "got new seq $1\n";
522 #store the sequence identifier in the current object.
523 #_new_feature() uses this as the seq_id to return
524 $self->{current_genomic_sequence
} = $1
525 or $self->throw("improper 'sequence' pattern in geneseqer parser. it does not capture the sequence identifier from the Sequence line in \$1");
529 # assumes starting at the beginning of pgl section
530 # parse the pgl section, put the pgl feature in the pgl buffer
531 # if attach_alignments() is set, remember to attach the associated
532 # alignments, getting them from the alignment index
536 while(my $line = $self->_readline) {
538 # warn "pgl parse $.: $line\n";
539 if( $line =~ $self->_pattern('pgl')) {
541 my (undef,$pgl_start,$pgl_end) = _parse_numbers
($line);
542 my @ags; # list of AGSs in the current PGL. filled in by the
544 my %alignments; #this is a hash by signature of alignments that
545 #are members of this PGL
546 while( $line = $self->_readline) {
547 if( $line =~ $self->_pattern('ags') ) {
549 my ($ags_start) = $line =~ /\(\s*(\d+)/ or $self->throw("parse error");
550 my ($ags_end) = $line =~ /(\d+)\s*\)/ or $self->throw("parse error");
551 my %subfeatures = $self->_parse_ags_subfeatures;
553 push @ags,$self->_new_feature( -start
=> $ags_start,
555 -type
=> 'processed_transcript',
556 -subfeatures
=> $subfeatures{exon_features
},
557 supporting_alignments
=> $subfeatures{alignment_features
},
560 elsif( $line =~ $self->_pattern('pgl') || $line =~ $self->_pattern('finished')) {
561 $self->_pushback($line);
566 @ags or $self->throw("no ags found for pgl. premature end of file?");
568 push @
{$self->{feature_buffer
}},
569 $self->_new_feature( -start
=> $pgl_start,
572 -subfeatures
=> [@ags, values %alignments],
577 $self->throw("parse error, cannot find end of PGL section");
580 #parse a single AGS entry, making subfeatures for their exons, etc
581 #return a hash-style list as exon_features => [feat,feat...],
582 # alignment_features => { signature => feat, signature => feat, ...},
583 sub _parse_ags_subfeatures
{
586 #these variables are all filled in by the parsing below
588 my @supporting_alignment_features;
590 while( my $line = $self->_readline ) {
591 if( $line =~ /^\s*Exon\s/ ) {
592 my (undef,$gstart,$gend,undef,$exon_score) = _parse_numbers
($line);
593 push @exon_features, $self->_new_feature( -start
=> $gstart,
595 -score
=> $exon_score,
599 elsif( $line =~ /^\s*PGS\s+\(/ ) {
600 if( $self->attach_alignments ) {
601 #parse all the PGS's and then break out of the while loop above when done
603 my (undef,@pgs_tokens) = grep {$_} (split /\s+/,$line);
605 # die "got tokens: ",Dumper(\@pgs_tokens);
606 my $pgs_sig = join ' ',@pgs_tokens;
607 push @supporting_alignment_features,$self->_get_indexed_alignment($pgs_sig)
608 || $self->throw("geneseqer parse error: no supporting alignment found with signature '$pgs_sig', line was '$line'");
609 } while ( $line = $self->_readline and $line =~ /^\s*PGS\s+\(/ );
611 $self->_pushback($line);
613 #after the supporting PGS's, we are done parsing this AGS
619 return ( exon_features
=> \
@exon_features,
620 alignment_features
=> \
@supporting_alignment_features,
624 #object method to create a new feature object, with some defaults and
625 #automation of the more repetitive bits (like adding targets and
627 sub _new_feature
(@
) {
631 # warn "got feature: ",Dumper(\%a);
633 $a{-seq_id
} ||= $self->{current_genomic_sequence
};
635 #replace spaces in source with underscores
636 $a{-source
} ||= 'GeneSeqer';
637 $a{-source
} =~ s/\s+/_/g;
639 #if no strand is given, make the proper strand and flip start and end if necessary
640 @a{qw
/-start -end -strand/} = _start_end_strand
(@a{qw
/-start -end/}) unless $a{-strand
};
642 #do some type mapping
643 my %type_map = ( similarity
=> 'nucleotide_motif',
644 exon
=> delete($a{is_alignment
}) ?
'match_part' : 'exon',
646 $a{-type
} = $type_map{ $a{-type
}} || $a{-type
};
648 #intercept target if present
650 if(my $t = delete $a{target
}) {
651 unless( $t->{-strand
} ) {
652 @
{$t}{qw
/-start -end -strand/} = _start_end_strand
(@
{$t}{qw
/-start -end/});
655 #HACK fix up target ids for geneseqer alignments
656 if( $t->{-target_id
} =~ s/([+-])$//) {
660 Bio
::Annotation
::Target
->new(%$t);
663 my $feature = Bio
::SeqFeature
::Annotated
->new(%a);
664 $feature->add_Annotation('Target',$target) if $target;
665 $feature->add_Annotation('ID',Bio
::Annotation
::SimpleValue
->new( -value
=> $a{id
} ) ) if $a{id
};
666 if( ref $a{supporting_alignments
} eq 'ARRAY' ) {
667 foreach ( @
{$a{-subfeatures
}} ) {
668 $feature->add_Annotation('supporting_alignment',Bio
::Annotation
::SimpleValue
->new( -value
=> $_ ));
669 # $feature->add_Annotation('Parent',Bio::Annotation::SimpleValue->new( -value => $_ ));
676 #object method that, given two (one-based) coordinates on a sequence
677 #and its length, returns the reverse complement of the coordinates in
679 sub _reverse_complement_coords
{
681 return ( $l-$e+1, $l-$s+1 );
684 #make a gff3-compliant feature start, end, and strand
685 #from a gamexml-style start and end that might be backwards
686 sub _start_end_strand
(@
) {
687 my ($start,$end) = @_;
689 return ($end,$start,-1);
691 return ($start,$end,1);
695 #extract all the (possibly fractional) numbers from a line of text,
696 #return them as a list
700 while( $line =~ /(\d+(?:\.\d+)?)/g ) {
709 sub _index_alignment
{
710 my ($self,$key,$feature) = @_;
711 # warn "indexing with '$key'\n";
712 $self->{alignment_features
}{$key} = $feature;
714 sub _purge_alignment_index
{
715 shift->{alignment_features
} = {};
717 sub _get_indexed_alignment
{
718 my ($self,$key) = @_;
719 return $self->{alignment_features
}{$key};
722 #chop the strand (+ or -) off of a seq name and return it as 1 for +
723 #or -1 for - note that this has the side effect of modifying the value
724 #of the thing that's passed to it
727 my $strand = chop $$ident;
728 if( $strand eq '+' ) {
731 elsif( $strand eq '-' ) {
734 die "Invalid strand '$strand' at end of identifier '$$ident$strand'";