lib/Bio/FeatureIO/geneseqer.pm

   1 =pod
   2
   3 =head1 NAME
   4
   5 Bio::FeatureIO::geneseqer - FeatureIO parser for GeneSeqer reports
   6
   7 =head1 SYNOPSIS
   8
   9   my $feature_in = Bio::FeatureIO->new(-format => 'geneseqer',
  10                                        -file   => 'myfile.geneseqer.out',
  11                                       );
  12
  13   $feature_in->attach_alignments(1); #attach alignments to their AGSs
  14
  15   $feature_in->mode('both_separate'); #parse both alignments and PGLs from the
  16                                       #report
  17
  18   while(my $feat = $feature_in->next_alignment) {
  19     #do something with alignment feature
  20   }
  21
  22   while(my $feat = $feature_in->next_feature) {
  23
  24     #get the alignment objects attached to the PGL feature
  25     my @alignments =
  26       map {
  27         $_->value
  28       } $feat->annotation->get_Annotations('supporting_alignment');
  29
  30     foreach my $alignment (@alignments) {
  31       # $alignment is now a feature representing one of the EST
  32       # alignments used to compute this PGL
  33     }
  34
  35   }
  36
  37
  38 =head1 DESCRIPTION
  39
  40 Parses GeneSeqer reports, returning L<Bio::SeqFeature::Annotated>
  41 objects.  Alignments are represented by a
  42 L<Bio::SeqFeature::Annotated> object with a 'Target' annotation,
  43 holding a L<Bio::Annotation::Target> object.  PGLs are also
  44 L<Bio::SeqFeature::Annotated> objects.
  45
  46 The mode() accessor controls the parsing mode, which is one of:
  47
  48 =over 4
  49
  50 =item 'pgls'
  51
  52 Only predicted gene location (PGL) features will be parsed and
  53 returned.  Supporting alignments in the file will be ignored.
  54
  55 =item 'alignments'
  56
  57 Only supporting EST/protein alignments will be returned, PGLs will be
  58 ignored.
  59
  60 =item 'both_separate'
  61
  62 Both PGLs and alignments will be parsed.  next_feature() will return
  63 PGLs only, and next_alignment() will return the alignments
  64
  65 =item 'both_merged'
  66
  67 Both PGLs and alignments will be parsed.  next_feature() will return
  68 B<both> PGLs and alignments.  next_alignment will always return undef
  69
  70 =back
  71
  72 If the parse mode is set to 'both_separate' or 'both_merged', you can
  73 also set the accessor attach_alignments() to have the supporting
  74 alignment features attached to the AGS features in which they are
  75 used.  These are attached under the Annotation key/tag
  76 'supporting_alignment'.  This is off by default.
  77
  78 In the future, this module may be enhanced to have the capability of
  79 producing L<Bio::SeqFeature::Generic> and
  80 L<Bio::SeqFeature::FeaturePair> objects.
  81
  82 =head1 STRUCTURE OF RETURNED FEATURES
  83
  84 This module returns a bunch of hierarchical features (that is, the
  85 features have subfeatures) that represent the GeneSeqer output.  The
  86 feature hierarchy for PGLs is arranged as:
  87
  88   region                    - the GeneSeqer predicted gene location (PGL)
  89    |- processed_transcript  - each alternative gene structure (AGS) in the PGL
  90    |  |- CDS                - each exon in that AGS
  91       |- (optional) match           - alignment, see below
  92                      |- match_part
  93
  94 and for alignments it is
  95
  96   match          - spliced alignment of an EST/protein to the ref seq
  97    |- match_part - each exon in the spliced alignment
  98
  99 Note that the alignments (match and match_part features) will only be
 100 attached to the gene features if attach_alignments() is set to true.
 101
 102 =head1 REDUCING MEMORY REQUIREMENTS
 103
 104 If you find yourself running out of memory while parsing large
 105 GeneSeqer output files with this parser, there are a couple of things
 106 you can do to decrease the memory requirements:
 107
 108   1.  do not use the 'both_separate' parsing mode, since it can sometimes
 109       need to keep a large buffer of features in memory
 110   2.  make sure attach_alignments() is set to false (0 or '')
 111
 112 If these two measures are not adequate for you, please explain your
 113 problem on the mailing list, and we may be able to address your
 114 situation.
 115
 116 =head1 FEEDBACK
 117
 118 =head2 Mailing Lists
 119
 120 User feedback is an integral part of the evolution of this and other
 121 Bioperl modules. Send your comments and suggestions preferably to
 122 the Bioperl mailing list.  Your participation is much appreciated.
 123
 124   bioperl-l@bioperl.org              - General discussion
 125   http://bioperl.org/MailList.shtml  - About the mailing lists
 126
 127 =head2 Reporting Bugs
 128
 129 Report bugs to the Bioperl bug tracking system to help us keep track
 130 of the bugs and their resolution. Bug reports can be submitted via
 131 the web:
 132
 133   http://bugzilla.bioperl.org/
 134
 135 =head1 AUTHOR
 136
 137  Robert Buels, rmb32@cornell.edu
 138
 139 =head1 APPENDIX
 140
 141 The rest of the documentation details each of the object methods.
 142 Internal methods are usually preceded with a _
 143
 144 =cut
 145
 146 # Let the code begin...
 147
 148 package Bio::FeatureIO::geneseqer;
 149 use strict;
 150
 151 use base qw(Bio::FeatureIO);
 152
 153 use Bio::SeqFeature::Annotated;
 154
 155 =head2 new()
 156
 157   constructor options for this parser are:
 158
 159   -mode
 160      takes same arguments as mode() accessor below
 161   -attach_alignments
 162      takes same arguments as attach_alignments() accessor below
 163
 164 =cut
 165
 166 sub _initialize {
 167   my($self,%arg) = @_;
 168
 169   $self->SUPER::_initialize(%arg);
 170
 171   #init buffers
 172   $self->{alignment_buffer} = [];
 173   $self->{feature_buffer} = [];
 174
 175   #set defaults
 176   $arg{-mode} ||= 'pgls';
 177   $arg{-attach_alignments} = 0 unless defined $arg{-attach_alignments};
 178
 179   #set options
 180   $self->mode($arg{-mode});
 181   $self->attach_alignments($arg{-attach_alignments});
 182 }
 183
 184 =head2 mode()
 185
 186   Usage: $fio->mode('alignments');
 187          #parse and return only alignments
 188   Desc : get/set the current parsing mode. Defaults to
 189          'pgls'
 190   Ret  : currently set parse mode
 191   Args : (optional) new parse mode, which is one of 'pgls',
 192          'alignments', 'both_separate', or 'both_merged'
 193
 194 =cut
 195
 196 sub mode {
 197   my ($self,$newmode) = @_;
 198   if(defined $newmode) {
 199     grep {$_ eq $newmode} qw/alignments pgls both_separate both_merged/
 200       or $self->throw("invalid mode selection $newmode");
 201     $self->{parse_mode} = $newmode;
 202   }
 203   return $self->{parse_mode};
 204 }
 205
 206 =head2 attach_alignments()
 207
 208   Usage: $fio->attach_alignments(1);
 209   Desc : get/set a flag telling the parser whether to attach
 210          the supporting alignment features to each AGS (gene
 211          feature) returned.
 212          Off by default.
 213          Attaches the alignments under a tag/Annotation
 214          named 'supporting_alignment'.
 215   Ret  : current value of this flag.
 216   Args : (optional) new true/false value of this flag
 217
 218 =cut
 219
 220 sub attach_alignments {
 221   my ($self,$newval) = @_;
 222   if(defined $newval) {
 223     $self->{attach_alignments} = $newval;
 224   }
 225   return $self->{attach_alignments};
 226 }
 227
 228 =head2 next_feature()
 229
 230  Usage   : my $feature = $featureio->next_feature();
 231  Function: reads the next Predicted Gene Location from the given
 232            geneseqer file, returning it as an object.  The PGLs
 233            will be returned in the same order in which they appear
 234            in the GeneSeqer output.
 235  Returns : a predicted gene location (PGL) feature, of class
 236            L<Bio::SeqFeature::Annotated>
 237  Args    : none
 238
 239 =cut
 240
 241 # next_feature
 242 #    return 'pgl/alignment' if in buffer
 243 #    else buffer 'pgl/alignment' and return it
 244 sub next_feature {
 245   my ($self,$featuretype) = shift;
 246   $featuretype ||= 'pgl';
 247
 248   #validate attach_alignments and mode selections
 249   if( $self->attach_alignments && $self->mode !~ /^both/ ) {
 250     $self->throw("mode must be set to 'both_merged' or 'both_separate' if attach_alignments() is set to true");
 251   }
 252
 253   return $self->next_alignment if $featuretype eq 'alignment';
 254
 255   $self->throw("invalid featuretype '$featuretype'") unless $featuretype eq 'pgl';
 256
 257   #now down to business
 258
 259   #get a pgl into the buffer if necessary
 260   unless( @{$self->{feature_buffer}} ) {
 261     $self->_buffer_item($self->mode eq 'both_merged' ? 'either' : 'pgl');
 262   }
 263
 264   return shift @{$self->{feature_buffer}};
 265 }
 266
 267 =head2 next_alignment()
 268
 269   Usage: my $feature = $fio->next_alignment
 270   Desc : get the next EST/protein alignment feature from the
 271          GeneSeqer report.  The alignments will be returned in
 272          the same order in which they appear in the file.
 273   Ret  : an alignment feature, or undef if there are no more
 274   Args : none
 275
 276 =cut
 277
 278 # next_alignment
 279 #    alias for next_feature('alignment')
 280 sub next_alignment {
 281   my ($self) = @_;
 282
 283   #validate attach_alignments and mode selections
 284   if( $self->attach_alignments && $self->mode !~ /^both_/ ) {
 285     $self->throw("mode must be set to 'both_separate' or 'both_merged' if attach_alignments() is set to true");
 286   }
 287
 288   #get an alignment into the buffer if necessary
 289   unless( @{$self->{alignment_buffer}} ) {
 290     $self->_buffer_item('alignment');
 291   }
 292
 293   return shift @{$self->{alignment_buffer}};
 294 }
 295
 296 =head2 write_feature()
 297
 298 Not implemented.
 299
 300 =cut
 301
 302 sub write_feature {
 303   shift->throw_not_implemented;
 304 }
 305
 306 #central repository for match patterns
 307 sub _pattern {
 308   my ($self,$patname) = @_;
 309   my %patterns =
 310     (
 311      pgl      => qr/^\s*PGL\s+\d+\s+\(/,
 312      ags      => qr/^\s*AGS-\d+\s/,
 313      sequence => qr/^\s*Sequence\s+\d+\s*:\s+(.+),\s+from/,
 314      align    => qr/^\s*\*{10}/,
 315      finished => qr/^[^A-Za-z]+finished\s/,
 316     );
 317   return $patterns{$patname};
 318 }
 319
 320 # buffer_item (central parsing loop)
 321 #    read from file.  if mode says we're using it, parse the thing and buffer it.
 322 #                     otherwise, **skip** it
 323 #    return 1 when you finally buffer a thing of the type your arg says you're looking for
 324 #    return 0 if EOF
 325 sub _buffer_item {
 326   my ($self,$requested_type) = @_;
 327   grep {$requested_type eq $_} qw/ alignment pgl either /
 328     or $self->throw("invalid requested type '$requested_type'");
 329
 330   ### MAIN PARSING LOOP detects the beginnings of sections, pushes
 331   # them back on the buffer, then calls the appropriate parsing
 332   # routine for them.  returns only if a.) it successfully finds and
 333   # parses an item of the requested type or b.) it reaches EOF
 334   while( my $line = $self->_readline ) {
 335 #     chomp $line;
 336 #     warn "decide line $.: $line\n";
 337     #ifs are sorted in decreasing order of how common a case they are
 338     if(    $self->mode ne 'pgls'       &&  $line =~ $self->_pattern('align') ) {
 339       $self->_pushback($line);
 340       $self->_buffer_alignment;
 341       return 1 if $requested_type eq 'alignment' || $requested_type eq 'either';
 342     }
 343
 344     elsif( $self->mode ne 'alignments' && $line =~ $self->_pattern('pgl')   ) {
 345       $self->_pushback($line);
 346       $self->_buffer_pgl;
 347       return 1 if $requested_type eq 'pgl' || $requested_type eq 'either';
 348     }
 349
 350     elsif( $line =~ $self->_pattern('sequence') ) {
 351       $self->_pushback($line);
 352       $self->_purge_alignment_index;
 353       $self->_parse_sequence_section;
 354     }
 355   }
 356 }
 357
 358 # buffer_alignment
 359 #    assumes starting at the beginning of alignment section.
 360 #    parse the alignment section, and put the alignment feature in the alignment buffer
 361 #    if we're attaching, also index the alignment by its signature
 362
 363 #parse an 'EST sequence' section of the report, up to and
 364 #including the '^MATCH ' line, returning a feature with
 365 #subfeatures representing the spliced cDNA alignment
 366 sub _buffer_alignment {
 367   my ($self) = @_;
 368
 369   my $buffer_for_alignments = $self->mode eq 'both_merged'
 370     ? $self->{feature_buffer}
 371     : $self->{alignment_buffer};
 372
 373   #these uninitialized variables are filled in by the parsing below
 374   my $parent_feature;
 375   my @subfeatures;
 376
 377   my $cstrand; #strandedness of the EST we're currently looking at, either '+' or '-'
 378   my $cid; #identifier of the EST we're looking at
 379   my $clength; #length of the EST we're looking at
 380
 381   my %patterns = (
 382                   hqpgs   => qr/^\s*hqPGS_(\S+_\S+[+-])/,
 383                   est_seq => qr/^\s*EST\s+sequence\s+\d+\s+([+-])\s*strand\s+(?:(\d+)\s+n\s+)?\(\s*File:\s*([^\)\s]+)\s*\)/,
 384                   exon    => qr/^\s+Exon\s/,
 385                   match   => qr/^MATCH/,
 386                   seqline => qr/^\s+(\d+)\s+([ACTG ]+)/,
 387                  );
 388
 389   while( my $line = $self->_readline ) {
 390 #     chomp $line;
 391 #     warn "cdna parsing '$line'\n";
 392     #EST sequence introduction
 393     if( $line =~ $patterns{est_seq} ) {
 394 #      warn "cdna est line\n";
 395       #parse out the strand, length, and ID of the EST we're dealing with,
 396       #storing them in the lexicals above
 397       $cstrand = $1;
 398       $cid = $3;
 399       $clength = $2;
 400       $cstrand eq chop $cid or $self->throw("inconsistent strandedness in line '$line'");
 401       $cstrand = $cstrand eq '+' ?  1 :
 402                  $cstrand eq '-' ? -1 : $self->throw("Unknown strand direction '$cstrand'");
 403
 404       #do a little error checking
 405       $cid or $self->throw("can't parse sequence identifier from line '$line'");
 406       $cstrand == 1 or $cstrand == -1 or $self->throw("can't parse EST alignment strand from line '$line'");
 407     }
 408     #try to figure out the est's length from the sequence lines
 409     #if it wasn't given in the 'EST sequence' line
 410     elsif( $line =~ $patterns{seqline} ) {
 411       my $startlength = $1;
 412       my $residues = $2;
 413       $residues =~ s/\s//g;
 414       my $newlength = $startlength + length($residues) - 1;
 415       unless($clength >= $newlength) {
 416         $clength = $newlength;
 417 #       chomp $line;
 418 #       warn "faked up clength $clength from line '$line'\n";
 419       }
 420     }
 421     #Exon line
 422     elsif( $line =~ $patterns{exon} ) {
 423       #parse out the genomic and cDNA start,end,length, and the score of the match
 424       my (undef,$gstart,$gend,undef,$cstart,$cend,undef,$exon_score)
 425         = _parse_numbers($line);
 426
 427       #if we're looking at a reverse-complemented EST, un-reverse-complement its start and end coordinates
 428       unless($cstrand == 1) {
 429         $cstrand == -1 or $self->throw("invalid strandedness '$cstrand'");
 430         $clength > 0 or $self->throw("can't parse EST length from line '$line'");
 431         ($cstart,$cend) = _reverse_complement_coords($cstart,$cend,$clength);
 432       }
 433
 434       my $subfeature =  $self->_new_feature( -start  => $gstart,
 435                                              -end    => $gend,
 436                                              -score  => $exon_score,
 437                                              -type   => 'match_part',
 438                                              -source => 'GeneSeqer',
 439                                              -target => { -target_id => $cid,
 440                                                           -start     => $cstart,
 441                                                           -end       => $cend,
 442                                                           -strand    => $cstrand,
 443                                                         },
 444                                            ); #note: we'll go through
 445                                               #and set the seq_id
 446                                               #later, when we get it
 447                                               #from the MATCH line
 448       push @subfeatures,$subfeature;
 449     }
 450     #MATCH line
 451     elsif( $line =~ $patterns{match} ) {
 452       my (undef,$genomic_seqname,$cdna_seqname,$alignment_score) = split /\s+/,$line;
 453       my $gstrand = _chop_strand(\$genomic_seqname);
 454       my $other_cstrand = _chop_strand(\$cdna_seqname);
 455       $cstrand eq $other_cstrand or $self->throw("inconsistent cDNA strandedness: $cstrand vs $other_cstrand");
 456
 457       #go back over the subfeatures and error check, figure out the
 458       #parent feature start and end, and set the appropriate seq_id on
 459       #each subfeature
 460       my $gstart;
 461       my $gend;
 462       foreach my $subfeature (@subfeatures) {
 463         #check that the strandedness is correct
 464         $subfeature->strand eq $gstrand or $self->throw("parser bug, inconsistent strands found ('".$subfeature->strand."' vs '$gstrand')");
 465         #set the proper seq_id
 466         $subfeature->seq_id($genomic_seqname);
 467         #set parent feature limits
 468         $gstart = $subfeature->start unless $gstart && $gstart < $subfeature->start;
 469         $gend = $subfeature->end unless $gend && $gend > $subfeature->end;
 470       }
 471
 472       #make an overall feature for this EST alignment
 473       $parent_feature = $self->_new_feature( -start  => $gstart,
 474                                              -end    => $gend,
 475                                              -seq_id => $genomic_seqname,
 476                                              -source => 'GeneSeqer',
 477                                              -score  => $alignment_score,
 478                                              -strand => $gstrand,
 479                                              -type   => 'match',
 480                                            );
 481
 482       #add all the subfeatures to it, reversing so they'll come out in the right order
 483       $parent_feature->add_SeqFeature($_) foreach @subfeatures;
 484     }
 485     elsif( $line =~ $patterns{hqpgs} ) {
 486
 487       #do some consistency checks with the contents of the PGS line
 488       my (undef,$other_cid) = split /_/, $1;
 489
 490       if( $self->attach_alignments ) {
 491         #now make the pgs signature string, and return it with the parent feature we made
 492         my (undef,@pgs_stuff) = split /\s+/,$line;
 493
 494         #$pgs_signature a string GeneSeqer uses to uniquely identify
 495         #this alignment.  We'll use these later to connect the PGLs to
 496         #their supporting alignments
 497         my $pgs_signature = join(' ',@pgs_stuff,$other_cid);
 498         #      $parent_feature->add_Annotation('comment',Bio::Annotation::Comment->new(-text => $pgs_signature));
 499         $self->_index_alignment($pgs_signature,$parent_feature);
 500       }
 501
 502       $parent_feature or $self->throw("parse error");
 503
 504       push @$buffer_for_alignments, $parent_feature;
 505       return;
 506     }
 507   }
 508 }
 509
 510 #parse the section of the report introducing the processing of a new
 511 #genomic sequence.  right now, just par
 512 sub _parse_sequence_section {
 513   my ($self) = @_;
 514   my $line = $self->_readline;
 515
 516   #parse the sequence identifier out of the line
 517   $line =~ $self->_pattern('sequence')
 518     or $self->throw("improper call of _parse_sequence_section, line '$line' does not match expected pattern");
 519
 520 #  warn "got new seq $1\n";
 521
 522   #store the sequence identifier in the current object.
 523   #_new_feature() uses this as the seq_id to return
 524   $self->{current_genomic_sequence} = $1
 525     or $self->throw("improper 'sequence' pattern in geneseqer parser.  it does not capture the sequence identifier from the Sequence line in \$1");
 526 }
 527
 528 # buffer_pgl
 529 #    assumes starting at the beginning of pgl section
 530 #    parse the pgl section, put the pgl feature in the pgl buffer
 531 #    if attach_alignments() is set, remember to attach the associated
 532 #    alignments, getting them from the alignment index
 533 sub _buffer_pgl {
 534   my $self = shift;
 535
 536   while(my $line = $self->_readline) {
 537     chomp $line;
 538 #    warn "pgl parse $.: $line\n";
 539     if( $line =~ $self->_pattern('pgl')) {
 540 #      warn $line;
 541       my (undef,$pgl_start,$pgl_end) = _parse_numbers($line);
 542       my @ags; # list of AGSs in the current PGL.  filled in by the
 543                # parsing below
 544       my %alignments; #this is a hash by signature of alignments that
 545                       #are members of this PGL
 546       while( $line = $self->_readline) {
 547         if( $line =~ $self->_pattern('ags') ) {
 548 #         warn $line;
 549           my ($ags_start) = $line =~ /\(\s*(\d+)/ or $self->throw("parse error");
 550           my ($ags_end)   = $line =~ /(\d+)\s*\)/ or $self->throw("parse error");
 551           my %subfeatures = $self->_parse_ags_subfeatures;
 552
 553           push @ags,$self->_new_feature( -start => $ags_start,
 554                                          -end   => $ags_end,
 555                                          -type  => 'processed_transcript',
 556                                          -subfeatures => $subfeatures{exon_features},
 557                                          supporting_alignments => $subfeatures{alignment_features},
 558                                        );
 559         }
 560         elsif( $line =~ $self->_pattern('pgl') || $line =~ $self->_pattern('finished')) {
 561           $self->_pushback($line);
 562           last;
 563         }
 564       }
 565
 566       @ags or $self->throw("no ags found for pgl.  premature end of file?");
 567
 568       push @{$self->{feature_buffer}},
 569         $self->_new_feature( -start  => $pgl_start,
 570                              -end    => $pgl_end,
 571                              -type   => 'region',
 572                              -subfeatures => [@ags, values %alignments],
 573                            );
 574       return;
 575     }
 576   }
 577   $self->throw("parse error, cannot find end of PGL section");
 578 }
 579
 580 #parse a single AGS entry, making subfeatures for their exons, etc
 581 #return a hash-style list as exon_features => [feat,feat...],
 582 # alignment_features => { signature => feat, signature => feat, ...},
 583 sub _parse_ags_subfeatures {
 584   my ($self) = @_;
 585
 586   #these variables are all filled in by the parsing below
 587   my @exon_features;
 588   my @supporting_alignment_features;
 589
 590   while( my $line = $self->_readline ) {
 591     if( $line =~ /^\s*Exon\s/ ) {
 592       my (undef,$gstart,$gend,undef,$exon_score) = _parse_numbers($line);
 593       push @exon_features, $self->_new_feature( -start => $gstart,
 594                                                 -end   => $gend,
 595                                                 -score => $exon_score,
 596                                                 -type  => 'CDS',
 597                                               );
 598     }
 599    elsif( $line =~ /^\s*PGS\s+\(/ ) {
 600      if( $self->attach_alignments ) {
 601      #parse all the PGS's and then break out of the while loop above when done
 602        do {
 603          my (undef,@pgs_tokens) = grep {$_} (split /\s+/,$line);
 604          #      use Data::Dumper;
 605          #      die "got tokens: ",Dumper(\@pgs_tokens);
 606          my $pgs_sig = join ' ',@pgs_tokens;
 607          push @supporting_alignment_features,$self->_get_indexed_alignment($pgs_sig)
 608            || $self->throw("geneseqer parse error: no supporting alignment found with signature '$pgs_sig', line was '$line'");
 609        } while ( $line = $self->_readline and $line =~ /^\s*PGS\s+\(/ );
 610
 611        $self->_pushback($line);
 612
 613        #after the supporting PGS's, we are done parsing this AGS
 614      }
 615      last;
 616    }
 617   }
 618
 619   return ( exon_features      =>  \@exon_features,
 620            alignment_features =>  \@supporting_alignment_features,
 621          );
 622 }
 623
 624 #object method to create a new feature object, with some defaults and
 625 #automation of the more repetitive bits (like adding targets and
 626 #subfeatures)
 627 sub _new_feature(@) {
 628   my ($self,%a) = @_;
 629
 630 #   use Data::Dumper;
 631 #   warn "got feature: ",Dumper(\%a);
 632
 633   $a{-seq_id} ||= $self->{current_genomic_sequence};
 634
 635   #replace spaces in source with underscores
 636   $a{-source} ||= 'GeneSeqer';
 637   $a{-source} =~ s/\s+/_/g;
 638
 639   #if no strand is given, make the proper strand and flip start and end if necessary
 640   @a{qw/-start -end -strand/} = _start_end_strand(@a{qw/-start -end/}) unless $a{-strand};
 641
 642   #do some type mapping
 643   my %type_map = ( similarity => 'nucleotide_motif',
 644                    exon       => delete($a{is_alignment}) ? 'match_part' : 'exon',
 645                  );
 646   $a{-type} =  $type_map{ $a{-type}} || $a{-type};
 647
 648   #intercept target if present
 649   my $target = do {
 650     if(my $t = delete $a{target}) {
 651       unless( $t->{-strand} ) {
 652         @{$t}{qw/-start -end -strand/} = _start_end_strand(@{$t}{qw/-start -end/});
 653       }
 654
 655       #HACK fix up target ids for geneseqer alignments
 656       if( $t->{-target_id} =~ s/([+-])$//) {
 657         $t->{-strand} = $1;
 658       }
 659
 660       Bio::Annotation::Target->new(%$t);
 661     }
 662   };
 663   my $feature = Bio::SeqFeature::Annotated->new(%a);
 664   $feature->add_Annotation('Target',$target) if $target;
 665   $feature->add_Annotation('ID',Bio::Annotation::SimpleValue->new( -value => $a{id} ) ) if $a{id};
 666   if( ref $a{supporting_alignments} eq 'ARRAY' ) {
 667     foreach ( @{$a{-subfeatures}} ) {
 668       $feature->add_Annotation('supporting_alignment',Bio::Annotation::SimpleValue->new( -value => $_ ));
 669 #      $feature->add_Annotation('Parent',Bio::Annotation::SimpleValue->new( -value => $_ ));
 670     }
 671   }
 672   return $feature;
 673 }
 674
 675
 676 #object method that, given two (one-based) coordinates on a sequence
 677 #and its length, returns the reverse complement of the coordinates in
 678 #a two-element list
 679 sub _reverse_complement_coords {
 680   my ($s,$e,$l) = @_;
 681   return ( $l-$e+1, $l-$s+1 );
 682 }
 683
 684 #make a gff3-compliant feature start, end, and strand
 685 #from a gamexml-style start and end that might be backwards
 686 sub _start_end_strand(@) {
 687   my ($start,$end) = @_;
 688   if($start > $end) {
 689     return ($end,$start,-1);
 690   } else {
 691     return ($start,$end,1);
 692   }
 693 }
 694
 695 #extract all the (possibly fractional) numbers from a line of text,
 696 #return them as a list
 697 sub _parse_numbers {
 698   my ($line) = @_;
 699   my @nums;
 700   while( $line =~ /(\d+(?:\.\d+)?)/g ) {
 701     push @nums, $1;
 702   }
 703   return @nums;
 704 }
 705
 706 # lookup_alignment
 707 # get_alignment
 708 # purge_alignments
 709 sub _index_alignment {
 710   my ($self,$key,$feature) = @_;
 711 #  warn "indexing with '$key'\n";
 712   $self->{alignment_features}{$key} = $feature;
 713 }
 714 sub _purge_alignment_index {
 715   shift->{alignment_features} = {};
 716 }
 717 sub _get_indexed_alignment {
 718   my ($self,$key) = @_;
 719   return $self->{alignment_features}{$key};
 720 }
 721
 722 #chop the strand (+ or -) off of a seq name and return it as 1 for +
 723 #or -1 for - note that this has the side effect of modifying the value
 724 #of the thing that's passed to it
 725 sub _chop_strand {
 726   my ($ident) = @_;
 727   my $strand = chop $$ident;
 728   if( $strand eq '+' ) {
 729     return 1;
 730   }
 731   elsif( $strand eq '-' ) {
 732     return -1;
 733   }
 734   die "Invalid strand '$strand' at end of identifier '$$ident$strand'";
 735 }
 736
 737 ###
 738 1;# do not remove
 739 ###
 740