a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / Bio / FeatureIO / gthxml.pm
blob5c0796337aecd9e9f4237eb1c48744fb48f4b8f6
1 =pod
3 =head1 NAME
5 Bio::FeatureIO::gthxml - FeatureIO parser for GenomeThreader XML reports
7 =head1 SYNOPSIS
9 my $feature_in = Bio::FeatureIO->new(-format => 'gthxml',
10 -file => 'myfile.gth.xml',
13 $feature_in->attach_alignments(1); #attach alignments to their AGSs
15 $feature_in->mode('both_separate'); #parse both alignments and PGLs from the
16 #report
18 while(my $feat = $feature_in->next_alignment) {
19 #do something with alignment feature
22 while(my $feat = $feature_in->next_feature) {
24 #get the alignment objects attached to the PGL feature
25 my @alignments =
26 map {
27 $_->value
28 } $feat->annotation->get_Annotations('supporting_alignment');
30 foreach my $alignment (@alignments) {
31 # $alignment is now a feature representing one of the EST
32 # alignments used to compute this PGL
38 =head1 DESCRIPTION
40 Parses GenomeThreader XML reports, returning L<Bio::SeqFeature::Annotated>
41 objects. Alignments are represented by a
42 L<Bio::SeqFeature::Annotated> object with a 'Target' annotation,
43 holding a L<Bio::Annotation::Target> object. PGLs are also
44 L<Bio::SeqFeature::Annotated> objects.
46 The mode() accessor controls the parsing mode, which is one of:
48 =over 4
50 =item 'pgls'
52 Only predicted gene location (PGL) features will be parsed and
53 returned. Supporting alignments in the file will be ignored.
55 =item 'alignments'
57 Only supporting EST/protein alignments will be returned, PGLs will be
58 ignored, meaning only next_alignment() will work, next_feature() will
59 return nothing.
61 =item 'both_separate'
63 Both PGLs and alignments will be parsed. next_feature() will return
64 PGLs only, and next_alignment() will return the alignments
66 =item 'both_merged'
68 Both PGLs and alignments will be parsed. next_feature() will return
69 B<both> PGLs and alignments. next_alignment will always return undef
71 =item 'alignments_merged'
73 Like 'alignments', except the alignments come out of next_feature()
74 instead of next_alignment()
76 =back
78 If the parse mode is set to 'both_separate' or 'both_merged', you can
79 also set the accessor attach_alignments() to have the supporting
80 alignment features attached to the AGS features in which they are
81 used. These are attached under the Annotation key/tag
82 'supporting_alignment'. This is off by default.
84 In the future, this module may be enhanced to have the capability of
85 producing L<Bio::SeqFeature::Generic> and
86 L<Bio::SeqFeature::FeaturePair> objects.
88 =head1 STRUCTURE OF RETURNED FEATURES
90 This module returns a bunch of hierarchical features (that is, the
91 features have subfeatures) that represent the GenomeThreader output.
92 The feature hierarchy for PGLs is arranged as:
94 gene - the PGL
95 |- mRNA - each alternative gene structure (AGS) in the PGL
96 |- five_prime_UTR (only present if there is one)
97 |- exon - each exon in that AGS
98 |- CDS - coding sequences in that AGS
99 |- three_prime_UTR (only present if there is one)
100 |- (optional) match - included if attach_alignments is on
101 |- match_part
103 and for alignments it is
105 match - spliced alignment of an EST/protein to the ref seq
106 |- match_part - each exon in the spliced alignment
108 Note that the alignments (match and match_part features) will only be
109 attached to the gene features if attach_alignments() is set to true.
110 Attaching the aligments to the gene features requires keeping all of
111 the alignments in memory, and if you're parsing a very large
112 GenomeThreader report, that may not be possible. Therefore,
113 attach_alignments() defaults to off().
115 =head1 REDUCING MEMORY REQUIREMENTS
117 If you find yourself running out of memory while parsing large
118 GenomeThreader output files with this parser, there are a couple of things
119 you can do to decrease the memory requirements:
121 1. do not use the 'both_separate' parsing mode, since it can sometimes
122 need to keep a large buffer of features in memory
123 2. make sure attach_alignments() is set to false (0 or '')
125 If these two measures are not adequate for you, please explain your
126 problem on the mailing list, and we may be able to address your
127 situation.
129 =head1 FEEDBACK
131 =head2 Mailing Lists
133 User feedback is an integral part of the evolution of this and other
134 Bioperl modules. Send your comments and suggestions preferably to
135 the Bioperl mailing list. Your participation is much appreciated.
137 bioperl-l@bioperl.org - General discussion
138 http://bioperl.org/MailList.shtml - About the mailing lists
140 =head2 Reporting Bugs
142 Report bugs to the Bioperl bug tracking system to help us keep track
143 of the bugs and their resolution. Bug reports can be submitted via
144 the web:
146 http://bugzilla.bioperl.org/
148 =head1 AUTHOR
150 Robert Buels, rmb32@cornell.edu
152 =head1 APPENDIX
154 The rest of the documentation details each of the object methods.
155 Internal methods are usually preceded with a _
157 =cut
159 # Let the code begin...
161 package Bio::FeatureIO::gthxml;
162 use strict;
163 use English;
165 use base qw(Bio::FeatureIO);
167 use XML::Twig;
169 use Bio::SeqFeature::Annotated;
170 use Bio::Annotation::Target;
172 =head2 new()
174 constructor options for this parser are:
176 -mode
177 takes same arguments as mode() accessor below
178 -attach_alignments
179 takes same arguments as attach_alignments() accessor below
181 =cut
183 sub _initialize {
184 my($self,%arg) = @_;
186 $self->SUPER::_initialize(%arg);
188 #init buffers
189 $self->{alignment_buffer} = [];
190 $self->{feature_buffer} = [];
192 #set defaults
193 $arg{-mode} ||= 'pgls';
194 $arg{-attach_alignments} = 0 unless defined $arg{-attach_alignments};
196 #set options
197 $self->mode($arg{-mode});
198 $self->attach_alignments($arg{-attach_alignments});
201 my $alignment_buffer = $self->mode eq 'both_merged' || $self->mode eq 'alignments_merged'
202 ? $self->{feature_buffer}
203 : $self->{alignment_buffer};
204 my @spliced_alignment_twig = $self->mode eq 'pgls'
205 ? ()
206 : (spliced_alignment => sub {
207 push @$alignment_buffer,
208 $self->_parse_alignment(@_);
209 shift->purge;
211 my @pgl_twig = $self->mode eq 'alignments' || $self->mode eq 'alignments_merged'
212 ? ()
213 : (predicted_gene_location => sub {
214 push @{$self->{feature_buffer}},
215 $self->_parse_pgl(@_);
216 shift->purge;
219 #now parse the entire input file, buffering pgls and alignments
220 eval {
221 XML::Twig->new( twig_roots =>
223 @spliced_alignment_twig,
224 @pgl_twig,
226 )->parse($self->_fh);
228 if( $EVAL_ERROR ) {
229 $self->throw("error parsing GTH XML file '".$self->file."': $EVAL_ERROR");
234 =head2 mode()
236 Usage: $fio->mode('alignments');
237 #parse and return only alignments
238 Desc : get/set the current parsing mode. Defaults to
239 'pgls'
240 Ret : currently set parse mode
241 Args : (optional) new parse mode, which is one of 'pgls',
242 'alignments', 'both_separate', or 'both_merged'
244 =cut
246 sub mode {
247 my ($self,$newmode) = @_;
248 if(defined $newmode) {
249 grep {$_ eq $newmode} qw/alignments pgls both_separate both_merged alignments_merged/
250 or $self->throw("invalid mode selection $newmode");
251 $self->{parse_mode} = $newmode;
253 return $self->{parse_mode};
256 =head2 attach_alignments()
258 Usage: $fio->attach_alignments(1);
259 Desc : get/set a flag telling the parser whether to attach
260 the supporting alignment features to each AGS (gene
261 feature) returned.
262 Off by default.
263 Attaches the alignments under a tag/Annotation
264 named 'supporting_alignment'.
265 Ret : current value of this flag.
266 Args : (optional) new true/false value of this flag
268 =cut
270 sub attach_alignments {
271 my ($self,$newval) = @_;
272 if(defined $newval) {
273 $self->{attach_alignments} = $newval;
275 return $self->{attach_alignments};
279 =head2 next_feature()
281 Usage : my $feature = $featureio->next_feature();
282 Function: returns the next available feature, which is either a PGL
283 or an alignment, depending on mode() (see above). Features
284 will be returned in the same order as they appear in the file.
285 Returns : a predicted gene location (PGL) feature, of class
286 L<Bio::SeqFeature::Annotated>
287 Args : none
289 =cut
291 # next_feature
292 # return 'pgl/alignment' if in buffer
293 # else buffer 'pgl/alignment' and return it
294 sub next_feature {
295 my ($self,$featuretype) = @_;
296 $featuretype ||= 'pgl';
298 $self->_validate_settings;
300 return $self->next_alignment if $featuretype eq 'alignment';
302 $self->throw("invalid featuretype '$featuretype'") unless $featuretype eq 'pgl';
304 return shift @{$self->{feature_buffer}};
307 =head2 next_alignment()
309 Usage: my $feature = $fio->next_alignment
310 Desc : get the next EST/protein alignment feature from the
311 GenomeThreader report. The alignments will be returned in
312 the same order in which they appear in the file.
313 Ret : an alignment feature, or undef if there are no more
314 Args : none
316 =cut
318 # next_alignment
319 # alias for next_feature('alignment')
320 sub next_alignment {
321 my ($self) = @_;
323 $self->_validate_settings;
325 return shift @{$self->{alignment_buffer}};
328 sub _validate_settings {
329 my $self = shift;
330 #validate attach_alignments and mode selections
331 if( $self->attach_alignments && $self->mode !~ /^both_/ ) {
332 $self->throw("mode must be set to 'both_separate' or 'both_merged' if attach_alignments() is set to true");
336 =head2 write_feature()
338 Not implemented.
340 =cut
342 sub write_feature {
343 shift->throw_not_implemented;
346 #given the featureio object, an XML::Twig object, and an XML::Twig
347 #element representing a <spliced_alignment> element, make a
348 #feature for this alignment
349 sub _parse_alignment {
350 my ($self,$twig,$element) = @_;
352 ### parse out some basic things about the main alignment
353 my $matchline = $element->first_descendant('MATCH_line');
354 my ($seq_id,$target_id) = map {$matchline->att($_)} qw/gen_id ref_id/;
355 my $score = $matchline->first_child('total_alignment_score')->text;
357 ### get the strandedness w.r.t. the reference seq
358 my $rstrand = $element->first_child('reference')->att('ref_strand');
360 #make features for all the sub-alignments
361 my @rstarts;
362 my @rends;
363 my ($start,$end);
364 my @subfeats = map {
365 my $gen = $_->first_child('gDNA_exon_boundary');
366 my $ref = $_->first_child('reference_exon_boundary');
367 my ($gstart,$gend,$gstrand) = _start_end_strand($gen->att('g_start'), $gen->att('g_stop'));
368 my ($rstart,$rend) = ($ref->att('r_start'), $ref->att('r_stop'));
369 push @rstarts,$rstart; push @rends,$rend;
370 my $subscore = $ref->att('r_score');
371 $start = $gstart if ! defined $start || $start > $gstart;
372 $end = $gend if ! defined $end || $end < $gend;
373 $self->_new_feature( -start => $gstart, -end => $gend, -strand => $gstrand, -score => $subscore,
374 -seq_id => $seq_id,
375 -type => 'match_part',
376 -target => { -target_id => $target_id,
377 -start => $rstart, -end => $rend, -strand => $rstrand,
380 } $element->first_child('predicted_gene_structure')->first_child('exon-intron_info')->children('exon');
382 my $strand = $subfeats[0]->strand;
384 #now make the main alignment
385 my $alignment = $self->_new_feature( -start => $start, -end => $end, -strand => $strand,
386 -score => $score,
387 -seq_id => $seq_id,
388 -type => 'match',
389 -target => { -target_id => $target_id,
390 -start => (sort {$a<=>$b} @rstarts)[0],
391 -end => (sort {$b<=>$a} @rends)[0],
392 -strand => $rstrand,
394 -subfeatures => \@subfeats,
397 #if we're attaching alignments to the PGLs that use them,
398 #index this alignment for later use
399 if( $self->attach_alignments ) {
400 my $pgs_sig = $self->_pgs_line_to_sig($element->first_descendant('PGS_line'));
401 $self->_index_alignment($pgs_sig,$alignment);
404 return $alignment;
407 #given a <PGS_line> element and an optional ID of the genomic sequence
408 #we're looking at, either from a PGL or an alignment section, parse it
409 #and return a string that can serve as the unique key for a hash of
410 #alignments
411 sub _pgs_line_to_sig {
412 my ($self,$pgsline_element,$genomic_id) = @_;
413 my $ref_id = do {
414 if(my $elem = $pgsline_element->first_child('rDNA')) {
415 $elem->att('rDNA_id')
417 elsif($elem = $pgsline_element->first_child('referenceDNA') ) {
418 $elem->att('id')
421 $genomic_id ||= ($pgsline_element->first_child('gDNA') or $self->throw('parse error'))->att('gen_id');
422 return join ',',
423 ( $ref_id,
424 $genomic_id,
425 map {
426 ( $_->att('e_start') || $_->att('start'),
427 $_->att('e_stop') || $_->att('stop'),
429 } $pgsline_element->descendants('exon'),
433 #given the featureio object, the twig object, and a twig element for a
434 #<predicted_gene_location>, make a feature and subfeatures for that
435 #pgl
436 sub _parse_pgl {
437 my ($self,$twig,$element) = @_;
439 my @subfeats = map {
440 $self->_parse_ags($twig,$_);
441 } $element->children('AGS_information');
443 return () unless @subfeats;
445 #parse out the start, end, and strand of this PGL
446 my ($start,$end,$strand) = do {
447 my $pgl_line = $element->first_child('PGL_line');
448 map {$pgl_line->att($_)} qw/PGL_start PGL_stop PGL_strand/;
450 my $strand2;
451 ($start,$end,$strand2) = _start_end_strand($start,$end);
452 $strand2 = $strand2 == 1 ? '+' : $strand2 == -1 ? '-' : $self->throw("invalid strand $strand2");
453 $strand2 eq $strand or $self->throw("parsing consistency error ('$strand' vs '$strand2'");
455 my $pgl_feature = $self->_new_feature( -start => $start,
456 -end => $end,
457 -strand => $strand,
458 -seq_id => $subfeats[0]->seq_id,
459 -type => 'gene',
460 -subfeatures => \@subfeats,
462 if( $self->attach_alignments ) {
463 foreach my $pgsline ($element->first_descendant('supporting_evidence')->children('PGS_line')) {
464 #use the <PGS_line> to look up the alignment features that were
465 #used for this, and attach them to this feature
466 my $pgs_sig = $self->_pgs_line_to_sig($pgsline,$subfeats[0]->seq_id);
467 my $alignment_feature = $self->_get_indexed_alignment($pgs_sig)
468 or $self->throw("parse error, cannot look up alignment with signature '$pgs_sig', available sigs are\n".join("\n",keys %{$self->{alignment_features}}));
469 $pgl_feature->add_Annotation('supporting_alignment',
470 Bio::Annotation::SimpleValue->new( -value => $alignment_feature ),
474 return $pgl_feature;
477 sub _parse_ags {
478 my ($self,$twig,$element) = @_;
481 my $seq_id = do {
482 if(my $gdna = $element->first_descendant('gDNA')) {
483 $gdna->att('id');
485 elsif($gdna = $element->first_descendant('none')) {
486 $gdna->att('gDNA_id');
489 unless($seq_id) {
490 $self->warn('WARNING: could not determine genomic sequence ID for AGS with no probable_ORFs. This XML output issue is fixed in genomethreader 0.9.54 and later, please consider upgrading.');
491 return ();
495 my @exons = map {
496 my $e = $_;
497 my $boundary = $e->first_child('gDNA_exon_boundary');
498 my ($start,$end,$strand) = _start_end_strand($boundary->att('e_start'), $boundary->att('e_stop'));
499 my $score = $e->att('e_score');
500 $self->_new_feature( -start => $start, -end => $end, -strand => $strand,
501 -score => $score,
502 -seq_id=>$seq_id,
503 -type => 'exon',
505 } $element->first_child('exon-intron_info')->children('exon');
507 my ($start,$end,$strand) = do {
508 my $coords = $element->first_child('AGS_line')->first_child('exon_coordinates');
509 _start_end_strand( $coords->first_child('exon')->att('e_start'),
510 $coords->last_child('exon')->att('e_stop'),
514 return $self->_new_feature( -start => $start,
515 -end => $end,
516 -strand => $strand,
517 -seq_id => $seq_id,
518 -type => 'mRNA',
519 -subfeatures => \@exons,
523 #object method to create a new feature object, with some defaults and
524 #automation of the more repetitive bits (like adding targets and
525 #subfeatures)
526 sub _new_feature(@) {
527 my ($self,%a) = @_;
529 UNIVERSAL::isa($self,__PACKAGE__)
530 or die('_new_feature is an object method, silly');
532 # use Data::Dumper;
533 # warn "got feature: ",Dumper(\%a);
535 $a{-seq_id} ||= $self->{current_genomic_sequence};
537 #replace spaces in source with underscores
538 $a{-source} ||= 'GenomeThreader';
539 $a{-source} =~ s/\s+/_/g;
541 #if no strand is given, make the proper strand and flip start and
542 #end if necessary
543 @a{qw/-start -end -strand/} = _start_end_strand(@a{qw/-start -end/}) unless $a{-strand};
545 #do some type mapping
546 my %type_map = ( similarity => 'nucleotide_motif',
547 exon => delete($a{is_alignment}) ? 'match_part' : 'exon',
549 $a{-type} = $type_map{ $a{-type}} || $a{-type};
551 #intercept target if present
552 my $target = do {
553 if(my $t = delete $a{-target}) {
554 unless( $t->{-strand} ) {
555 @{$t}{qw/-start -end -strand/} = _start_end_strand(@{$t}{qw/-start -end/});
558 #HACK fix up target ids for alignments
559 if( $t->{-target_id} =~ s/([+-])$//) {
560 $t->{-strand} = $1;
563 Bio::Annotation::Target->new(%$t);
566 my $feature = Bio::SeqFeature::Annotated->new(%a);
567 $feature->add_Annotation('Target',$target) if $target;
568 $feature->add_Annotation('ID',Bio::Annotation::SimpleValue->new( -value => $a{id} ) ) if $a{id};
569 if( ref $a{supporting_alignments} eq 'ARRAY' ) {
570 foreach ( @{$a{-subfeatures}} ) {
571 $feature->add_Annotation('supporting_alignment',Bio::Annotation::SimpleValue->new( -value => $_ ));
574 return $feature;
578 #object method that, given two (one-based) coordinates on a sequence
579 #and its length, returns the reverse complement of the coordinates in
580 #a two-element list
581 sub _reverse_complement_coords {
582 my ($s,$e,$l) = @_;
583 return ( $l-$e+1, $l-$s+1 );
586 #make a gff3-compliant feature start, end, and strand
587 #from a gamexml-style start and end that might be backwards
588 sub _start_end_strand(@) {
589 my ($start,$end) = @_;
590 if($start > $end) {
591 return ($end,$start,-1);
592 } else {
593 return ($start,$end,1);
597 # lookup_alignment
598 # get_alignment
599 # purge_alignments
600 sub _index_alignment {
601 my ($self,$key,$feature) = @_;
602 # warn "indexing with '$key'\n";
603 $self->{alignment_features}{$key} = $feature;
605 sub _purge_alignment_index {
606 shift->{alignment_features} = {};
608 sub _get_indexed_alignment {
609 my ($self,$key) = @_;
610 return $self->{alignment_features}{$key};
613 #chop the strand (+ or -) off of a seq name and return it as 1 for +
614 #or -1 for - note that this has the side effect of modifying the value
615 #of the thing that's passed to it
616 sub _chop_strand {
617 my ($ident) = @_;
618 my $strand = chop $$ident;
619 if( $strand eq '+' ) {
620 return 1;
622 elsif( $strand eq '-' ) {
623 return -1;
625 die "Invalid strand '$strand' at end of identifier '$$ident$strand'";
629 1;# do not remove