lib/Bio/FeatureIO/gthxml.pm

   1 =pod
   2
   3 =head1 NAME
   4
   5 Bio::FeatureIO::gthxml - FeatureIO parser for GenomeThreader XML reports
   6
   7 =head1 SYNOPSIS
   8
   9   my $feature_in = Bio::FeatureIO->new(-format => 'gthxml',
  10                                        -file   => 'myfile.gth.xml',
  11                                       );
  12
  13   $feature_in->attach_alignments(1); #attach alignments to their AGSs
  14
  15   $feature_in->mode('both_separate'); #parse both alignments and PGLs from the
  16                                       #report
  17
  18   while(my $feat = $feature_in->next_alignment) {
  19     #do something with alignment feature
  20   }
  21
  22   while(my $feat = $feature_in->next_feature) {
  23
  24     #get the alignment objects attached to the PGL feature
  25     my @alignments =
  26       map {
  27         $_->value
  28       } $feat->annotation->get_Annotations('supporting_alignment');
  29
  30     foreach my $alignment (@alignments) {
  31       # $alignment is now a feature representing one of the EST
  32       # alignments used to compute this PGL
  33     }
  34
  35   }
  36
  37
  38 =head1 DESCRIPTION
  39
  40 Parses GenomeThreader XML reports, returning L<Bio::SeqFeature::Annotated>
  41 objects.  Alignments are represented by a
  42 L<Bio::SeqFeature::Annotated> object with a 'Target' annotation,
  43 holding a L<Bio::Annotation::Target> object.  PGLs are also
  44 L<Bio::SeqFeature::Annotated> objects.
  45
  46 The mode() accessor controls the parsing mode, which is one of:
  47
  48 =over 4
  49
  50 =item 'pgls'
  51
  52 Only predicted gene location (PGL) features will be parsed and
  53 returned.  Supporting alignments in the file will be ignored.
  54
  55 =item 'alignments'
  56
  57 Only supporting EST/protein alignments will be returned, PGLs will be
  58 ignored, meaning only next_alignment() will work, next_feature() will
  59 return nothing.
  60
  61 =item 'both_separate'
  62
  63 Both PGLs and alignments will be parsed.  next_feature() will return
  64 PGLs only, and next_alignment() will return the alignments
  65
  66 =item 'both_merged'
  67
  68 Both PGLs and alignments will be parsed.  next_feature() will return
  69 B<both> PGLs and alignments.  next_alignment will always return undef
  70
  71 =item 'alignments_merged'
  72
  73 Like 'alignments', except the alignments come out of next_feature()
  74 instead of next_alignment()
  75
  76 =back
  77
  78 If the parse mode is set to 'both_separate' or 'both_merged', you can
  79 also set the accessor attach_alignments() to have the supporting
  80 alignment features attached to the AGS features in which they are
  81 used.  These are attached under the Annotation key/tag
  82 'supporting_alignment'.  This is off by default.
  83
  84 In the future, this module may be enhanced to have the capability of
  85 producing L<Bio::SeqFeature::Generic> and
  86 L<Bio::SeqFeature::FeaturePair> objects.
  87
  88 =head1 STRUCTURE OF RETURNED FEATURES
  89
  90 This module returns a bunch of hierarchical features (that is, the
  91 features have subfeatures) that represent the GenomeThreader output.
  92 The feature hierarchy for PGLs is arranged as:
  93
  94   gene                       - the PGL
  95    |- mRNA - each alternative gene structure (AGS) in the PGL
  96        |- five_prime_UTR (only present if there is one)
  97        |- exon    - each exon in that AGS
  98        |- CDS     - coding sequences in that AGS
  99        |- three_prime_UTR (only present if there is one)
 100        |- (optional) match     - included if attach_alignments is on
 101                       |- match_part
 102
 103 and for alignments it is
 104
 105   match          - spliced alignment of an EST/protein to the ref seq
 106    |- match_part - each exon in the spliced alignment
 107
 108 Note that the alignments (match and match_part features) will only be
 109 attached to the gene features if attach_alignments() is set to true.
 110 Attaching the aligments to the gene features requires keeping all of
 111 the alignments in memory, and if you're parsing a very large
 112 GenomeThreader report, that may not be possible.  Therefore,
 113 attach_alignments() defaults to off().
 114
 115 =head1 REDUCING MEMORY REQUIREMENTS
 116
 117 If you find yourself running out of memory while parsing large
 118 GenomeThreader output files with this parser, there are a couple of things
 119 you can do to decrease the memory requirements:
 120
 121   1.  do not use the 'both_separate' parsing mode, since it can sometimes
 122       need to keep a large buffer of features in memory
 123   2.  make sure attach_alignments() is set to false (0 or '')
 124
 125 If these two measures are not adequate for you, please explain your
 126 problem on the mailing list, and we may be able to address your
 127 situation.
 128
 129 =head1 FEEDBACK
 130
 131 =head2 Mailing Lists
 132
 133 User feedback is an integral part of the evolution of this and other
 134 Bioperl modules. Send your comments and suggestions preferably to
 135 the Bioperl mailing list.  Your participation is much appreciated.
 136
 137   bioperl-l@bioperl.org              - General discussion
 138   http://bioperl.org/MailList.shtml  - About the mailing lists
 139
 140 =head2 Reporting Bugs
 141
 142 Report bugs to the Bioperl bug tracking system to help us keep track
 143 of the bugs and their resolution. Bug reports can be submitted via
 144 the web:
 145
 146   http://bugzilla.bioperl.org/
 147
 148 =head1 AUTHOR
 149
 150  Robert Buels, rmb32@cornell.edu
 151
 152 =head1 APPENDIX
 153
 154 The rest of the documentation details each of the object methods.
 155 Internal methods are usually preceded with a _
 156
 157 =cut
 158
 159 # Let the code begin...
 160
 161 package Bio::FeatureIO::gthxml;
 162 use strict;
 163 use English;
 164
 165 use base qw(Bio::FeatureIO);
 166
 167 use XML::Twig;
 168
 169 use Bio::SeqFeature::Annotated;
 170 use Bio::Annotation::Target;
 171
 172 =head2 new()
 173
 174   constructor options for this parser are:
 175
 176   -mode
 177      takes same arguments as mode() accessor below
 178   -attach_alignments
 179      takes same arguments as attach_alignments() accessor below
 180
 181 =cut
 182
 183 sub _initialize {
 184   my($self,%arg) = @_;
 185
 186   $self->SUPER::_initialize(%arg);
 187
 188   #init buffers
 189   $self->{alignment_buffer} = [];
 190   $self->{feature_buffer} = [];
 191
 192   #set defaults
 193   $arg{-mode} ||= 'pgls';
 194   $arg{-attach_alignments} = 0 unless defined $arg{-attach_alignments};
 195
 196   #set options
 197   $self->mode($arg{-mode});
 198   $self->attach_alignments($arg{-attach_alignments});
 199
 200
 201   my $alignment_buffer = $self->mode eq 'both_merged' || $self->mode eq 'alignments_merged'
 202     ? $self->{feature_buffer}
 203     : $self->{alignment_buffer};
 204   my @spliced_alignment_twig = $self->mode eq 'pgls'
 205     ? ()
 206     : (spliced_alignment => sub {
 207          push @$alignment_buffer,
 208            $self->_parse_alignment(@_);
 209          shift->purge;
 210        });
 211   my @pgl_twig  = $self->mode eq 'alignments' || $self->mode eq 'alignments_merged'
 212     ? ()
 213     : (predicted_gene_location => sub {
 214          push @{$self->{feature_buffer}},
 215            $self->_parse_pgl(@_);
 216          shift->purge;
 217        });
 218
 219   #now parse the entire input file, buffering pgls and alignments
 220   eval {
 221     XML::Twig->new( twig_roots =>
 222                     {
 223                      @spliced_alignment_twig,
 224                      @pgl_twig,
 225                     }
 226                   )->parse($self->_fh);
 227   };
 228   if( $EVAL_ERROR ) {
 229     $self->throw("error parsing GTH XML file '".$self->file."': $EVAL_ERROR");
 230   }
 231
 232 }
 233
 234 =head2 mode()
 235
 236   Usage: $fio->mode('alignments');
 237          #parse and return only alignments
 238   Desc : get/set the current parsing mode. Defaults to
 239          'pgls'
 240   Ret  : currently set parse mode
 241   Args : (optional) new parse mode, which is one of 'pgls',
 242          'alignments', 'both_separate', or 'both_merged'
 243
 244 =cut
 245
 246 sub mode {
 247   my ($self,$newmode) = @_;
 248   if(defined $newmode) {
 249     grep {$_ eq $newmode} qw/alignments pgls both_separate both_merged alignments_merged/
 250       or $self->throw("invalid mode selection $newmode");
 251     $self->{parse_mode} = $newmode;
 252   }
 253   return $self->{parse_mode};
 254 }
 255
 256 =head2 attach_alignments()
 257
 258   Usage: $fio->attach_alignments(1);
 259   Desc : get/set a flag telling the parser whether to attach
 260          the supporting alignment features to each AGS (gene
 261          feature) returned.
 262          Off by default.
 263          Attaches the alignments under a tag/Annotation
 264          named 'supporting_alignment'.
 265   Ret  : current value of this flag.
 266   Args : (optional) new true/false value of this flag
 267
 268 =cut
 269
 270 sub attach_alignments {
 271   my ($self,$newval) = @_;
 272   if(defined $newval) {
 273     $self->{attach_alignments} = $newval;
 274   }
 275   return $self->{attach_alignments};
 276 }
 277
 278
 279 =head2 next_feature()
 280
 281  Usage   : my $feature = $featureio->next_feature();
 282  Function: returns the next available feature, which is either a PGL
 283            or an alignment, depending on mode() (see above).  Features
 284            will be returned in the same order as they appear in the file.
 285  Returns : a predicted gene location (PGL) feature, of class
 286            L<Bio::SeqFeature::Annotated>
 287  Args    : none
 288
 289 =cut
 290
 291 # next_feature
 292 #    return 'pgl/alignment' if in buffer
 293 #    else buffer 'pgl/alignment' and return it
 294 sub next_feature {
 295   my ($self,$featuretype) = @_;
 296   $featuretype ||= 'pgl';
 297
 298   $self->_validate_settings;
 299
 300   return $self->next_alignment if $featuretype eq 'alignment';
 301
 302   $self->throw("invalid featuretype '$featuretype'") unless $featuretype eq 'pgl';
 303
 304   return shift @{$self->{feature_buffer}};
 305 }
 306
 307 =head2 next_alignment()
 308
 309   Usage: my $feature = $fio->next_alignment
 310   Desc : get the next EST/protein alignment feature from the
 311          GenomeThreader report.  The alignments will be returned in
 312          the same order in which they appear in the file.
 313   Ret  : an alignment feature, or undef if there are no more
 314   Args : none
 315
 316 =cut
 317
 318 # next_alignment
 319 #    alias for next_feature('alignment')
 320 sub next_alignment {
 321   my ($self) = @_;
 322
 323   $self->_validate_settings;
 324
 325   return shift @{$self->{alignment_buffer}};
 326 }
 327
 328 sub _validate_settings {
 329   my $self = shift;
 330   #validate attach_alignments and mode selections
 331   if( $self->attach_alignments && $self->mode !~ /^both_/ ) {
 332     $self->throw("mode must be set to 'both_separate' or 'both_merged' if attach_alignments() is set to true");
 333   }
 334 }
 335
 336 =head2 write_feature()
 337
 338 Not implemented.
 339
 340 =cut
 341
 342 sub write_feature {
 343   shift->throw_not_implemented;
 344 }
 345
 346 #given the featureio object, an XML::Twig object, and an XML::Twig
 347 #element representing a <spliced_alignment> element, make a
 348 #feature for this alignment
 349 sub _parse_alignment {
 350   my ($self,$twig,$element) = @_;
 351
 352   ### parse out some basic things about the main alignment
 353   my $matchline  = $element->first_descendant('MATCH_line');
 354   my ($seq_id,$target_id) = map {$matchline->att($_)} qw/gen_id ref_id/;
 355   my $score = $matchline->first_child('total_alignment_score')->text;
 356
 357   ### get the strandedness w.r.t. the reference seq
 358   my $rstrand = $element->first_child('reference')->att('ref_strand');
 359
 360   #make features for all the sub-alignments
 361   my @rstarts;
 362   my @rends;
 363   my ($start,$end);
 364   my @subfeats = map {
 365     my $gen = $_->first_child('gDNA_exon_boundary');
 366     my $ref = $_->first_child('reference_exon_boundary');
 367     my ($gstart,$gend,$gstrand) = _start_end_strand($gen->att('g_start'), $gen->att('g_stop'));
 368     my ($rstart,$rend) = ($ref->att('r_start'), $ref->att('r_stop'));
 369     push @rstarts,$rstart; push @rends,$rend;
 370     my $subscore = $ref->att('r_score');
 371     $start = $gstart if ! defined $start || $start > $gstart;
 372     $end = $gend if ! defined $end || $end < $gend;
 373     $self->_new_feature( -start  => $gstart, -end => $gend, -strand => $gstrand, -score => $subscore,
 374                          -seq_id => $seq_id,
 375                          -type   => 'match_part',
 376                          -target => { -target_id => $target_id,
 377                                       -start     => $rstart, -end => $rend, -strand => $rstrand,
 378                                     },
 379                        );
 380   } $element->first_child('predicted_gene_structure')->first_child('exon-intron_info')->children('exon');
 381
 382   my $strand = $subfeats[0]->strand;
 383
 384   #now make the main alignment
 385   my $alignment = $self->_new_feature( -start  => $start, -end => $end, -strand => $strand,
 386                                        -score  => $score,
 387                                        -seq_id => $seq_id,
 388                                        -type   => 'match',
 389                                        -target  => { -target_id => $target_id,
 390                                                      -start     => (sort {$a<=>$b} @rstarts)[0],
 391                                                      -end       => (sort {$b<=>$a} @rends)[0],
 392                                                      -strand    => $rstrand,
 393                                                    },
 394                                        -subfeatures => \@subfeats,
 395                                      );
 396
 397   #if we're attaching alignments to the PGLs that use them,
 398   #index this alignment for later use
 399   if( $self->attach_alignments ) {
 400     my $pgs_sig = $self->_pgs_line_to_sig($element->first_descendant('PGS_line'));
 401     $self->_index_alignment($pgs_sig,$alignment);
 402   }
 403
 404   return $alignment;
 405 }
 406
 407 #given a <PGS_line> element and an optional ID of the genomic sequence
 408 #we're looking at, either from a PGL or an alignment section, parse it
 409 #and return a string that can serve as the unique key for a hash of
 410 #alignments
 411 sub _pgs_line_to_sig {
 412   my ($self,$pgsline_element,$genomic_id) = @_;
 413   my $ref_id = do {
 414     if(my $elem = $pgsline_element->first_child('rDNA')) {
 415       $elem->att('rDNA_id')
 416     }
 417     elsif($elem = $pgsline_element->first_child('referenceDNA') ) {
 418       $elem->att('id')
 419     }
 420   };
 421   $genomic_id ||= ($pgsline_element->first_child('gDNA') or $self->throw('parse error'))->att('gen_id');
 422   return join ',',
 423     ( $ref_id,
 424       $genomic_id,
 425       map {
 426         ( $_->att('e_start') || $_->att('start'),
 427           $_->att('e_stop') || $_->att('stop'),
 428         )
 429       } $pgsline_element->descendants('exon'),
 430     );
 431 }
 432
 433 #given the featureio object, the twig object, and a twig element for a
 434 #<predicted_gene_location>, make a feature and subfeatures for that
 435 #pgl
 436 sub _parse_pgl {
 437   my ($self,$twig,$element) = @_;
 438
 439   my @subfeats = map {
 440     $self->_parse_ags($twig,$_);
 441   } $element->children('AGS_information');
 442
 443   return () unless @subfeats;
 444
 445   #parse out the start, end, and strand of this PGL
 446   my ($start,$end,$strand) = do {
 447     my $pgl_line = $element->first_child('PGL_line');
 448     map {$pgl_line->att($_)} qw/PGL_start PGL_stop PGL_strand/;
 449   };
 450   my $strand2;
 451   ($start,$end,$strand2) = _start_end_strand($start,$end);
 452   $strand2 = $strand2 == 1 ? '+' : $strand2 == -1 ? '-' : $self->throw("invalid strand $strand2");
 453   $strand2 eq $strand or $self->throw("parsing consistency error ('$strand' vs '$strand2'");
 454
 455   my $pgl_feature =  $self->_new_feature( -start  => $start,
 456                                           -end    => $end,
 457                                           -strand => $strand,
 458                                           -seq_id => $subfeats[0]->seq_id,
 459                                           -type   => 'gene',
 460                                           -subfeatures => \@subfeats,
 461                                         );
 462   if( $self->attach_alignments ) {
 463     foreach my $pgsline ($element->first_descendant('supporting_evidence')->children('PGS_line')) {
 464       #use the <PGS_line> to look up the alignment features that were
 465       #used for this, and attach them to this feature
 466       my $pgs_sig = $self->_pgs_line_to_sig($pgsline,$subfeats[0]->seq_id);
 467       my $alignment_feature = $self->_get_indexed_alignment($pgs_sig)
 468         or $self->throw("parse error, cannot look up alignment with signature '$pgs_sig', available sigs are\n".join("\n",keys %{$self->{alignment_features}}));
 469       $pgl_feature->add_Annotation('supporting_alignment',
 470                                    Bio::Annotation::SimpleValue->new( -value => $alignment_feature ),
 471                                   );
 472     }
 473   }
 474   return $pgl_feature;
 475 }
 476
 477 sub _parse_ags {
 478   my ($self,$twig,$element) = @_;
 479
 480
 481   my $seq_id = do {
 482     if(my $gdna = $element->first_descendant('gDNA')) {
 483       $gdna->att('id');
 484     }
 485     elsif($gdna = $element->first_descendant('none')) {
 486       $gdna->att('gDNA_id');
 487     }
 488   };
 489   unless($seq_id) {
 490     $self->warn('WARNING: could not determine genomic sequence ID for AGS with no probable_ORFs.  This XML output issue is fixed in genomethreader 0.9.54 and later, please consider upgrading.');
 491     return ();
 492   }
 493
 494
 495   my @exons = map {
 496     my $e = $_;
 497     my $boundary = $e->first_child('gDNA_exon_boundary');
 498     my ($start,$end,$strand) = _start_end_strand($boundary->att('e_start'), $boundary->att('e_stop'));
 499     my $score = $e->att('e_score');
 500     $self->_new_feature( -start => $start, -end => $end, -strand => $strand,
 501                          -score => $score,
 502                          -seq_id=>$seq_id,
 503                          -type => 'exon',
 504                        );
 505   } $element->first_child('exon-intron_info')->children('exon');
 506
 507   my ($start,$end,$strand) = do {
 508     my $coords = $element->first_child('AGS_line')->first_child('exon_coordinates');
 509     _start_end_strand( $coords->first_child('exon')->att('e_start'),
 510                        $coords->last_child('exon')->att('e_stop'),
 511                      )
 512   };
 513
 514   return $self->_new_feature( -start  => $start,
 515                               -end => $end,
 516                               -strand => $strand,
 517                               -seq_id => $seq_id,
 518                               -type   => 'mRNA',
 519                               -subfeatures => \@exons,
 520                             );
 521 }
 522
 523 #object method to create a new feature object, with some defaults and
 524 #automation of the more repetitive bits (like adding targets and
 525 #subfeatures)
 526 sub _new_feature(@) {
 527   my ($self,%a) = @_;
 528
 529   UNIVERSAL::isa($self,__PACKAGE__)
 530       or die('_new_feature is an object method, silly');
 531
 532 #   use Data::Dumper;
 533 #   warn "got feature: ",Dumper(\%a);
 534
 535   $a{-seq_id} ||= $self->{current_genomic_sequence};
 536
 537   #replace spaces in source with underscores
 538   $a{-source} ||= 'GenomeThreader';
 539   $a{-source} =~ s/\s+/_/g;
 540
 541   #if no strand is given, make the proper strand and flip start and
 542   #end if necessary
 543   @a{qw/-start -end -strand/} = _start_end_strand(@a{qw/-start -end/}) unless $a{-strand};
 544
 545   #do some type mapping
 546   my %type_map = ( similarity => 'nucleotide_motif',
 547                    exon       => delete($a{is_alignment}) ? 'match_part' : 'exon',
 548                  );
 549   $a{-type} =  $type_map{ $a{-type}} || $a{-type};
 550
 551   #intercept target if present
 552   my $target = do {
 553     if(my $t = delete $a{-target}) {
 554       unless( $t->{-strand} ) {
 555         @{$t}{qw/-start -end -strand/} = _start_end_strand(@{$t}{qw/-start -end/});
 556       }
 557
 558       #HACK fix up target ids for alignments
 559       if( $t->{-target_id} =~ s/([+-])$//) {
 560         $t->{-strand} = $1;
 561       }
 562
 563       Bio::Annotation::Target->new(%$t);
 564     }
 565   };
 566   my $feature = Bio::SeqFeature::Annotated->new(%a);
 567   $feature->add_Annotation('Target',$target) if $target;
 568   $feature->add_Annotation('ID',Bio::Annotation::SimpleValue->new( -value => $a{id} ) ) if $a{id};
 569   if( ref $a{supporting_alignments} eq 'ARRAY' ) {
 570     foreach ( @{$a{-subfeatures}} ) {
 571       $feature->add_Annotation('supporting_alignment',Bio::Annotation::SimpleValue->new( -value => $_ ));
 572     }
 573   }
 574   return $feature;
 575 }
 576
 577
 578 #object method that, given two (one-based) coordinates on a sequence
 579 #and its length, returns the reverse complement of the coordinates in
 580 #a two-element list
 581 sub _reverse_complement_coords {
 582   my ($s,$e,$l) = @_;
 583   return ( $l-$e+1, $l-$s+1 );
 584 }
 585
 586 #make a gff3-compliant feature start, end, and strand
 587 #from a gamexml-style start and end that might be backwards
 588 sub _start_end_strand(@) {
 589   my ($start,$end) = @_;
 590   if($start > $end) {
 591     return ($end,$start,-1);
 592   } else {
 593     return ($start,$end,1);
 594   }
 595 }
 596
 597 # lookup_alignment
 598 # get_alignment
 599 # purge_alignments
 600 sub _index_alignment {
 601   my ($self,$key,$feature) = @_;
 602 #  warn "indexing with '$key'\n";
 603   $self->{alignment_features}{$key} = $feature;
 604 }
 605 sub _purge_alignment_index {
 606   shift->{alignment_features} = {};
 607 }
 608 sub _get_indexed_alignment {
 609   my ($self,$key) = @_;
 610   return $self->{alignment_features}{$key};
 611 }
 612
 613 #chop the strand (+ or -) off of a seq name and return it as 1 for +
 614 #or -1 for - note that this has the side effect of modifying the value
 615 #of the thing that's passed to it
 616 sub _chop_strand {
 617   my ($ident) = @_;
 618   my $strand = chop $$ident;
 619   if( $strand eq '+' ) {
 620     return 1;
 621   }
 622   elsif( $strand eq '-' ) {
 623     return -1;
 624   }
 625   die "Invalid strand '$strand' at end of identifier '$$ident$strand'";
 626 }
 627
 628 ###
 629 1;# do not remove
 630 ###
 631