5 Bio::FeatureIO::gthxml - FeatureIO parser for GenomeThreader XML reports
9 my $feature_in = Bio::FeatureIO->new(-format => 'gthxml',
10 -file => 'myfile.gth.xml',
13 $feature_in->attach_alignments(1); #attach alignments to their AGSs
15 $feature_in->mode('both_separate'); #parse both alignments and PGLs from the
18 while(my $feat = $feature_in->next_alignment) {
19 #do something with alignment feature
22 while(my $feat = $feature_in->next_feature) {
24 #get the alignment objects attached to the PGL feature
28 } $feat->annotation->get_Annotations('supporting_alignment');
30 foreach my $alignment (@alignments) {
31 # $alignment is now a feature representing one of the EST
32 # alignments used to compute this PGL
40 Parses GenomeThreader XML reports, returning L<Bio::SeqFeature::Annotated>
41 objects. Alignments are represented by a
42 L<Bio::SeqFeature::Annotated> object with a 'Target' annotation,
43 holding a L<Bio::Annotation::Target> object. PGLs are also
44 L<Bio::SeqFeature::Annotated> objects.
46 The mode() accessor controls the parsing mode, which is one of:
52 Only predicted gene location (PGL) features will be parsed and
53 returned. Supporting alignments in the file will be ignored.
57 Only supporting EST/protein alignments will be returned, PGLs will be
58 ignored, meaning only next_alignment() will work, next_feature() will
63 Both PGLs and alignments will be parsed. next_feature() will return
64 PGLs only, and next_alignment() will return the alignments
68 Both PGLs and alignments will be parsed. next_feature() will return
69 B<both> PGLs and alignments. next_alignment will always return undef
71 =item 'alignments_merged'
73 Like 'alignments', except the alignments come out of next_feature()
74 instead of next_alignment()
78 If the parse mode is set to 'both_separate' or 'both_merged', you can
79 also set the accessor attach_alignments() to have the supporting
80 alignment features attached to the AGS features in which they are
81 used. These are attached under the Annotation key/tag
82 'supporting_alignment'. This is off by default.
84 In the future, this module may be enhanced to have the capability of
85 producing L<Bio::SeqFeature::Generic> and
86 L<Bio::SeqFeature::FeaturePair> objects.
88 =head1 STRUCTURE OF RETURNED FEATURES
90 This module returns a bunch of hierarchical features (that is, the
91 features have subfeatures) that represent the GenomeThreader output.
92 The feature hierarchy for PGLs is arranged as:
95 |- mRNA - each alternative gene structure (AGS) in the PGL
96 |- five_prime_UTR (only present if there is one)
97 |- exon - each exon in that AGS
98 |- CDS - coding sequences in that AGS
99 |- three_prime_UTR (only present if there is one)
100 |- (optional) match - included if attach_alignments is on
103 and for alignments it is
105 match - spliced alignment of an EST/protein to the ref seq
106 |- match_part - each exon in the spliced alignment
108 Note that the alignments (match and match_part features) will only be
109 attached to the gene features if attach_alignments() is set to true.
110 Attaching the aligments to the gene features requires keeping all of
111 the alignments in memory, and if you're parsing a very large
112 GenomeThreader report, that may not be possible. Therefore,
113 attach_alignments() defaults to off().
115 =head1 REDUCING MEMORY REQUIREMENTS
117 If you find yourself running out of memory while parsing large
118 GenomeThreader output files with this parser, there are a couple of things
119 you can do to decrease the memory requirements:
121 1. do not use the 'both_separate' parsing mode, since it can sometimes
122 need to keep a large buffer of features in memory
123 2. make sure attach_alignments() is set to false (0 or '')
125 If these two measures are not adequate for you, please explain your
126 problem on the mailing list, and we may be able to address your
133 User feedback is an integral part of the evolution of this and other
134 Bioperl modules. Send your comments and suggestions preferably to
135 the Bioperl mailing list. Your participation is much appreciated.
137 bioperl-l@bioperl.org - General discussion
138 http://bioperl.org/MailList.shtml - About the mailing lists
140 =head2 Reporting Bugs
142 Report bugs to the Bioperl bug tracking system to help us keep track
143 of the bugs and their resolution. Bug reports can be submitted via
146 http://bugzilla.bioperl.org/
150 Robert Buels, rmb32@cornell.edu
154 The rest of the documentation details each of the object methods.
155 Internal methods are usually preceded with a _
159 # Let the code begin...
161 package Bio
::FeatureIO
::gthxml
;
165 use base
qw(Bio::FeatureIO);
169 use Bio
::SeqFeature
::Annotated
;
170 use Bio
::Annotation
::Target
;
174 constructor options for this parser are:
177 takes same arguments as mode() accessor below
179 takes same arguments as attach_alignments() accessor below
186 $self->SUPER::_initialize
(%arg);
189 $self->{alignment_buffer
} = [];
190 $self->{feature_buffer
} = [];
193 $arg{-mode
} ||= 'pgls';
194 $arg{-attach_alignments
} = 0 unless defined $arg{-attach_alignments
};
197 $self->mode($arg{-mode
});
198 $self->attach_alignments($arg{-attach_alignments
});
201 my $alignment_buffer = $self->mode eq 'both_merged' || $self->mode eq 'alignments_merged'
202 ?
$self->{feature_buffer
}
203 : $self->{alignment_buffer
};
204 my @spliced_alignment_twig = $self->mode eq 'pgls'
206 : (spliced_alignment
=> sub {
207 push @
$alignment_buffer,
208 $self->_parse_alignment(@_);
211 my @pgl_twig = $self->mode eq 'alignments' || $self->mode eq 'alignments_merged'
213 : (predicted_gene_location
=> sub {
214 push @
{$self->{feature_buffer
}},
215 $self->_parse_pgl(@_);
219 #now parse the entire input file, buffering pgls and alignments
221 XML
::Twig
->new( twig_roots
=>
223 @spliced_alignment_twig,
226 )->parse($self->_fh);
229 $self->throw("error parsing GTH XML file '".$self->file."': $EVAL_ERROR");
236 Usage: $fio->mode('alignments');
237 #parse and return only alignments
238 Desc : get/set the current parsing mode. Defaults to
240 Ret : currently set parse mode
241 Args : (optional) new parse mode, which is one of 'pgls',
242 'alignments', 'both_separate', or 'both_merged'
247 my ($self,$newmode) = @_;
248 if(defined $newmode) {
249 grep {$_ eq $newmode} qw
/alignments pgls both_separate both_merged alignments_merged/
250 or $self->throw("invalid mode selection $newmode");
251 $self->{parse_mode
} = $newmode;
253 return $self->{parse_mode
};
256 =head2 attach_alignments()
258 Usage: $fio->attach_alignments(1);
259 Desc : get/set a flag telling the parser whether to attach
260 the supporting alignment features to each AGS (gene
263 Attaches the alignments under a tag/Annotation
264 named 'supporting_alignment'.
265 Ret : current value of this flag.
266 Args : (optional) new true/false value of this flag
270 sub attach_alignments
{
271 my ($self,$newval) = @_;
272 if(defined $newval) {
273 $self->{attach_alignments
} = $newval;
275 return $self->{attach_alignments
};
279 =head2 next_feature()
281 Usage : my $feature = $featureio->next_feature();
282 Function: returns the next available feature, which is either a PGL
283 or an alignment, depending on mode() (see above). Features
284 will be returned in the same order as they appear in the file.
285 Returns : a predicted gene location (PGL) feature, of class
286 L<Bio::SeqFeature::Annotated>
292 # return 'pgl/alignment' if in buffer
293 # else buffer 'pgl/alignment' and return it
295 my ($self,$featuretype) = @_;
296 $featuretype ||= 'pgl';
298 $self->_validate_settings;
300 return $self->next_alignment if $featuretype eq 'alignment';
302 $self->throw("invalid featuretype '$featuretype'") unless $featuretype eq 'pgl';
304 return shift @
{$self->{feature_buffer
}};
307 =head2 next_alignment()
309 Usage: my $feature = $fio->next_alignment
310 Desc : get the next EST/protein alignment feature from the
311 GenomeThreader report. The alignments will be returned in
312 the same order in which they appear in the file.
313 Ret : an alignment feature, or undef if there are no more
319 # alias for next_feature('alignment')
323 $self->_validate_settings;
325 return shift @
{$self->{alignment_buffer
}};
328 sub _validate_settings
{
330 #validate attach_alignments and mode selections
331 if( $self->attach_alignments && $self->mode !~ /^both_/ ) {
332 $self->throw("mode must be set to 'both_separate' or 'both_merged' if attach_alignments() is set to true");
336 =head2 write_feature()
343 shift->throw_not_implemented;
346 #given the featureio object, an XML::Twig object, and an XML::Twig
347 #element representing a <spliced_alignment> element, make a
348 #feature for this alignment
349 sub _parse_alignment
{
350 my ($self,$twig,$element) = @_;
352 ### parse out some basic things about the main alignment
353 my $matchline = $element->first_descendant('MATCH_line');
354 my ($seq_id,$target_id) = map {$matchline->att($_)} qw
/gen_id ref_id/;
355 my $score = $matchline->first_child('total_alignment_score')->text;
357 ### get the strandedness w.r.t. the reference seq
358 my $rstrand = $element->first_child('reference')->att('ref_strand');
360 #make features for all the sub-alignments
365 my $gen = $_->first_child('gDNA_exon_boundary');
366 my $ref = $_->first_child('reference_exon_boundary');
367 my ($gstart,$gend,$gstrand) = _start_end_strand
($gen->att('g_start'), $gen->att('g_stop'));
368 my ($rstart,$rend) = ($ref->att('r_start'), $ref->att('r_stop'));
369 push @rstarts,$rstart; push @rends,$rend;
370 my $subscore = $ref->att('r_score');
371 $start = $gstart if ! defined $start || $start > $gstart;
372 $end = $gend if ! defined $end || $end < $gend;
373 $self->_new_feature( -start
=> $gstart, -end
=> $gend, -strand
=> $gstrand, -score
=> $subscore,
375 -type
=> 'match_part',
376 -target
=> { -target_id
=> $target_id,
377 -start
=> $rstart, -end
=> $rend, -strand
=> $rstrand,
380 } $element->first_child('predicted_gene_structure')->first_child('exon-intron_info')->children('exon');
382 my $strand = $subfeats[0]->strand;
384 #now make the main alignment
385 my $alignment = $self->_new_feature( -start
=> $start, -end
=> $end, -strand
=> $strand,
389 -target
=> { -target_id
=> $target_id,
390 -start
=> (sort {$a<=>$b} @rstarts)[0],
391 -end
=> (sort {$b<=>$a} @rends)[0],
394 -subfeatures
=> \
@subfeats,
397 #if we're attaching alignments to the PGLs that use them,
398 #index this alignment for later use
399 if( $self->attach_alignments ) {
400 my $pgs_sig = $self->_pgs_line_to_sig($element->first_descendant('PGS_line'));
401 $self->_index_alignment($pgs_sig,$alignment);
407 #given a <PGS_line> element and an optional ID of the genomic sequence
408 #we're looking at, either from a PGL or an alignment section, parse it
409 #and return a string that can serve as the unique key for a hash of
411 sub _pgs_line_to_sig
{
412 my ($self,$pgsline_element,$genomic_id) = @_;
414 if(my $elem = $pgsline_element->first_child('rDNA')) {
415 $elem->att('rDNA_id')
417 elsif($elem = $pgsline_element->first_child('referenceDNA') ) {
421 $genomic_id ||= ($pgsline_element->first_child('gDNA') or $self->throw('parse error'))->att('gen_id');
426 ( $_->att('e_start') || $_->att('start'),
427 $_->att('e_stop') || $_->att('stop'),
429 } $pgsline_element->descendants('exon'),
433 #given the featureio object, the twig object, and a twig element for a
434 #<predicted_gene_location>, make a feature and subfeatures for that
437 my ($self,$twig,$element) = @_;
440 $self->_parse_ags($twig,$_);
441 } $element->children('AGS_information');
443 return () unless @subfeats;
445 #parse out the start, end, and strand of this PGL
446 my ($start,$end,$strand) = do {
447 my $pgl_line = $element->first_child('PGL_line');
448 map {$pgl_line->att($_)} qw
/PGL_start PGL_stop PGL_strand/;
451 ($start,$end,$strand2) = _start_end_strand
($start,$end);
452 $strand2 = $strand2 == 1 ?
'+' : $strand2 == -1 ?
'-' : $self->throw("invalid strand $strand2");
453 $strand2 eq $strand or $self->throw("parsing consistency error ('$strand' vs '$strand2'");
455 my $pgl_feature = $self->_new_feature( -start
=> $start,
458 -seq_id
=> $subfeats[0]->seq_id,
460 -subfeatures
=> \
@subfeats,
462 if( $self->attach_alignments ) {
463 foreach my $pgsline ($element->first_descendant('supporting_evidence')->children('PGS_line')) {
464 #use the <PGS_line> to look up the alignment features that were
465 #used for this, and attach them to this feature
466 my $pgs_sig = $self->_pgs_line_to_sig($pgsline,$subfeats[0]->seq_id);
467 my $alignment_feature = $self->_get_indexed_alignment($pgs_sig)
468 or $self->throw("parse error, cannot look up alignment with signature '$pgs_sig', available sigs are\n".join("\n",keys %{$self->{alignment_features
}}));
469 $pgl_feature->add_Annotation('supporting_alignment',
470 Bio
::Annotation
::SimpleValue
->new( -value
=> $alignment_feature ),
478 my ($self,$twig,$element) = @_;
482 if(my $gdna = $element->first_descendant('gDNA')) {
485 elsif($gdna = $element->first_descendant('none')) {
486 $gdna->att('gDNA_id');
490 $self->warn('WARNING: could not determine genomic sequence ID for AGS with no probable_ORFs. This XML output issue is fixed in genomethreader 0.9.54 and later, please consider upgrading.');
497 my $boundary = $e->first_child('gDNA_exon_boundary');
498 my ($start,$end,$strand) = _start_end_strand
($boundary->att('e_start'), $boundary->att('e_stop'));
499 my $score = $e->att('e_score');
500 $self->_new_feature( -start
=> $start, -end
=> $end, -strand
=> $strand,
505 } $element->first_child('exon-intron_info')->children('exon');
507 my ($start,$end,$strand) = do {
508 my $coords = $element->first_child('AGS_line')->first_child('exon_coordinates');
509 _start_end_strand
( $coords->first_child('exon')->att('e_start'),
510 $coords->last_child('exon')->att('e_stop'),
514 return $self->_new_feature( -start
=> $start,
519 -subfeatures
=> \
@exons,
523 #object method to create a new feature object, with some defaults and
524 #automation of the more repetitive bits (like adding targets and
526 sub _new_feature
(@
) {
529 UNIVERSAL
::isa
($self,__PACKAGE__
)
530 or die('_new_feature is an object method, silly');
533 # warn "got feature: ",Dumper(\%a);
535 $a{-seq_id
} ||= $self->{current_genomic_sequence
};
537 #replace spaces in source with underscores
538 $a{-source
} ||= 'GenomeThreader';
539 $a{-source
} =~ s/\s+/_/g;
541 #if no strand is given, make the proper strand and flip start and
543 @a{qw
/-start -end -strand/} = _start_end_strand
(@a{qw
/-start -end/}) unless $a{-strand
};
545 #do some type mapping
546 my %type_map = ( similarity
=> 'nucleotide_motif',
547 exon
=> delete($a{is_alignment
}) ?
'match_part' : 'exon',
549 $a{-type
} = $type_map{ $a{-type
}} || $a{-type
};
551 #intercept target if present
553 if(my $t = delete $a{-target
}) {
554 unless( $t->{-strand
} ) {
555 @
{$t}{qw
/-start -end -strand/} = _start_end_strand
(@
{$t}{qw
/-start -end/});
558 #HACK fix up target ids for alignments
559 if( $t->{-target_id
} =~ s/([+-])$//) {
563 Bio
::Annotation
::Target
->new(%$t);
566 my $feature = Bio
::SeqFeature
::Annotated
->new(%a);
567 $feature->add_Annotation('Target',$target) if $target;
568 $feature->add_Annotation('ID',Bio
::Annotation
::SimpleValue
->new( -value
=> $a{id
} ) ) if $a{id
};
569 if( ref $a{supporting_alignments
} eq 'ARRAY' ) {
570 foreach ( @
{$a{-subfeatures
}} ) {
571 $feature->add_Annotation('supporting_alignment',Bio
::Annotation
::SimpleValue
->new( -value
=> $_ ));
578 #object method that, given two (one-based) coordinates on a sequence
579 #and its length, returns the reverse complement of the coordinates in
581 sub _reverse_complement_coords
{
583 return ( $l-$e+1, $l-$s+1 );
586 #make a gff3-compliant feature start, end, and strand
587 #from a gamexml-style start and end that might be backwards
588 sub _start_end_strand
(@
) {
589 my ($start,$end) = @_;
591 return ($end,$start,-1);
593 return ($start,$end,1);
600 sub _index_alignment
{
601 my ($self,$key,$feature) = @_;
602 # warn "indexing with '$key'\n";
603 $self->{alignment_features
}{$key} = $feature;
605 sub _purge_alignment_index
{
606 shift->{alignment_features
} = {};
608 sub _get_indexed_alignment
{
609 my ($self,$key) = @_;
610 return $self->{alignment_features
}{$key};
613 #chop the strand (+ or -) off of a seq name and return it as 1 for +
614 #or -1 for - note that this has the side effect of modifying the value
615 #of the thing that's passed to it
618 my $strand = chop $$ident;
619 if( $strand eq '+' ) {
622 elsif( $strand eq '-' ) {
625 die "Invalid strand '$strand' at end of identifier '$$ident$strand'";