lib/Bio/FeatureIO/fgenesh.pm

   1 =pod
   2
   3 =head1 NAME
   4
   5 Bio::FeatureIO::fgenesh - FeatureIO parser for FGENESH reports
   6
   7 =head1 SYNOPSIS
   8
   9   my $feature_in = Bio::FeatureIO->new(-format => 'fgenesh',
  10                                        -file   => 'myfile.fgenesh',
  11                                       );
  12
  13   while(my $feat = $feature_in->next_feature) {
  14     #do something with the feature
  15   }
  16
  17
  18 =head1 DESCRIPTION
  19
  20 Parses FGENESH reports into L<Bio::SeqFeature::Annotated> objects,
  21 with subfeatures.  The feature hierarchy is:
  22
  23       mRNA
  24         |- five_prime_UTR
  25         |- CDS
  26         |- ...
  27         `- three_prime_UTR
  28       polyA_signal_site
  29
  30 One or both of the UTRs and/or polyA_signal_site might be missing.
  31 Sometimes FGENESH does not provide them.
  32
  33 =head1 FEEDBACK
  34
  35 =head2 Mailing Lists
  36
  37 User feedback is an integral part of the evolution of this and other
  38 Bioperl modules. Send your comments and suggestions preferably to
  39 the Bioperl mailing list.  Your participation is much appreciated.
  40
  41   bioperl-l@bioperl.org              - General discussion
  42   http://bioperl.org/MailList.shtml  - About the mailing lists
  43
  44 =head2 Reporting Bugs
  45
  46 Report bugs to the Bioperl bug tracking system to help us keep track
  47 of the bugs and their resolution. Bug reports can be submitted via
  48 the web:
  49
  50   http://bugzilla.bioperl.org/
  51
  52 =head1 AUTHOR
  53
  54  Robert Buels, rmb32@cornell.edu
  55
  56 =head1 APPENDIX
  57
  58 The rest of the documentation details each of the object methods.
  59 Internal methods are usually preceded with a _
  60
  61 =cut
  62
  63 # Let the code begin...
  64
  65 package Bio::FeatureIO::fgenesh;
  66 use strict;
  67
  68 use base qw(Bio::FeatureIO);
  69
  70 use Bio::SeqFeature::Annotated;
  71 use Bio::Annotation::Target;
  72
  73 =head2 new()
  74
  75   returns a new parser FeatureIO::fgenesh object
  76
  77 =cut
  78
  79 # sub _initialize {
  80 #   my($self,%arg) = @_;
  81
  82 #   $self->SUPER::_initialize(%arg);
  83
  84 #   #init buffers
  85 #   $self->{alignment_buffer} = [];
  86 #   $self->{feature_buffer} = [];
  87
  88 #   #set defaults
  89 #   $arg{-mode} ||= 'pgls';
  90 #   $arg{-attach_alignments} = 0 unless defined $arg{-attach_alignments};
  91
  92 #   #set options
  93 #   $self->mode($arg{-mode});
  94 #   $self->attach_alignments($arg{-attach_alignments});
  95
  96
  97 #   my $alignment_buffer = $self->mode eq 'both_merged' || $self->mode eq 'alignments_merged'
  98 #     ? $self->{feature_buffer}
  99 #     : $self->{alignment_buffer};
 100 #   my @spliced_alignment_twig = $self->mode eq 'pgls'
 101 #     ? ()
 102 #     : (spliced_alignment => sub {
 103 #        push @$alignment_buffer,
 104 #          $self->_parse_alignment(@_);
 105 #        shift->purge;
 106 #        });
 107 #   my @pgl_twig  = $self->mode eq 'alignments' || $self->mode eq 'alignments_merged'
 108 #     ? ()
 109 #     : (predicted_gene_location => sub {
 110 #        push @{$self->{feature_buffer}},
 111 #          $self->_parse_pgl(@_);
 112 #        shift->purge;
 113 #        });
 114
 115 #   #now parse the entire input file, buffering pgls and alignments
 116 #   XML::Twig->new( twig_roots =>
 117 #                 {
 118 #                  @spliced_alignment_twig,
 119 #                  @pgl_twig,
 120 #                 }
 121 #               )->parse($self->_fh);
 122 # }
 123
 124 =head2 next_feature()
 125
 126  Usage   : my $feature = $featureio->next_feature();
 127  Function: returns the next available gene prediction.
 128            Predictions will be returned in the same order as they
 129            appear in the file.
 130  Returns : a predicted gene feature, of class
 131            L<Bio::SeqFeature::Annotated>
 132  Args    : none
 133
 134 =cut
 135
 136 sub next_feature {
 137   my ($self) = @_;
 138
 139   #return a buffered feature if we have one
 140   if($self->{feature_buffer} && @{$self->{feature_buffer}}) {
 141     return shift @{$self->{feature_buffer}};
 142   }
 143
 144   #otherwise, continue parsing
 145   while(my $line = $self->_readline) {
 146     if( $self->_match( $line => 'is_prediction_line' )) {
 147       #must be a prediction line
 148       $self->_pushback($line);
 149       my ($mrna,$polya) = $self->_parse_prediction;
 150       push @{$self->{feature_buffer}},$polya if $polya;
 151       return $mrna;
 152     }
 153     elsif( my ($seqname) = $self->_match( $line => 'capture_seq_name' )) {
 154       $self->{seqname} = $seqname;
 155       #save the sequence name for further reference and continue
 156     }
 157     elsif( my ($pname) = $self->_match( $line => 'capture_params_name' )) {
 158       $self->{paramsname} = $pname;
 159     }
 160     elsif( $self->_match( $line => 'is_start_proteins' )) {
 161       #we end our parsing when the predicted protein section begins
 162
 163       #gotta push it back so we'll hit it again if the user calls
 164       #next_feature again
 165       $self->_pushback($line);
 166       return;
 167     }
 168   }
 169 }
 170
 171 #central function for keeping all the regexps used in this parser,
 172 #returns the result of matching the given text against the pattern
 173 #with the given name
 174 sub _match {
 175   my ($self,$text,$patname) = @_;
 176   my %pats = ( is_prediction_line => qr/^\s*\d+\s*[+-]/,
 177                capture_seq_name   => qr/^\s*Seq(?:uence)?\s*name\s*:\s*(\S+)/i,
 178                capture_params_name=> qr/^\s*FGENESH.+prediction.+in (\S+).+DNA/i,
 179                is_start_proteins  => qr/^\s*Predicted protein(s):/i,
 180              );
 181   $pats{$patname} or $self->throw("unknown pattern name '$patname'");
 182   return $text =~ $pats{$patname};
 183 }
 184
 185 sub _parse_prediction {
 186   my ($self) = @_;
 187
 188   #store the lines
 189   my @features;
 190   my $polya; #< store the polya separately if present
 191
 192   my $line;
 193
 194   while($line = $self->_readline and $self->_match($line => 'is_prediction_line')) {
 195 #    warn "parsing line: $line";
 196     my @data = split /\s+/,$line;
 197     shift @data until $data[0]; #< get rid of leading whitespace
 198
 199 #    warn "data is: ".join(',',@data)."\n";
 200
 201     ####  now parse the prediction line in earnest
 202
 203     my ($prediction_number,$strand) = splice @data,0,2;
 204
 205     #next thing might be either an exon number or the type
 206     my $exon_num  = $data[0] =~ /^\d+$/ ? shift(@data) : undef;
 207
 208     my $type      = shift @data;
 209     my $start     = shift @data;
 210
 211     my ($end,$score) = do {
 212       if($data[0] eq '-') {
 213         shift @data;
 214         splice @data,0,2
 215       } else {
 216         $start,shift @data #< has no end, thus end is same as start
 217       }
 218     };
 219
 220 #     my $orf_start = shift @data;
 221
 222 #     my $orf_end   = shift @data;
 223 #        $orf_end   = shift @data if $orf_end eq '-';
 224
 225     ### now make a feature out of it
 226     my %type_map = (
 227                     TSS  => 'five_prime_UTR',
 228                     PolA => 'polyA_signal_sequence',
 229                     CDSi => 'CDS',
 230                     CDSf => 'CDS',
 231                     CDSl => 'CDS',
 232                     CDSo => 'CDS',
 233                    );
 234     $type = $type_map{$type} || do {$self->warn("could not convert fgenesh type '$type' to a SOFA type");
 235                                        'region'}; #< default to 'region' and warn about it
 236     my $pname = $self->{paramsname} ? "_$self->{paramsname}" : '';
 237     push @features, Bio::SeqFeature::Annotated->new
 238       ( -start  => $start, -end => $end, -strand => $strand,
 239         -score  => $score,
 240         -type   => $type,
 241         -seq_id => defined $self->{seqname} ? $self->{seqname}
 242                                             : $self->throw("parse error, no sequence line found before predictions"),
 243         -source => "FGENESH$pname",
 244         -annots => { ID => "FGENESH$pname-$self->{seqname}-gene_$prediction_number-$type".($exon_num ? "-$exon_num" : '')},
 245       );
 246   }
 247
 248   ### now go back and correct/add some things
 249   #make an array with features ordered from transcription start to end
 250   our $strand = $features[0]->strand;
 251   @features = reverse @features if $strand == -1;
 252
 253   #make subs to operate on a feature relative to the direction of transcription
 254     sub se { #< return start/end string, relative to dir of transcription
 255       my ($se) = @_;
 256       return $strand == -1 ? ( $se eq 'start' ? 'end' : 'start' ) : $se;
 257     }
 258     sub fse {   #< access feature start/end relative to dir of transcription
 259       my ($feat,$se,$val) = @_;
 260       $se = se($se);
 261       return $feat->$se($val);
 262     }
 263     sub add { #< sub to add $a and $b, reversing +/- if on reverse strand
 264       my ($a,$b) = @_;
 265       return $strand == -1
 266         ? $a - $b
 267         : $a + $b;
 268     }
 269
 270   #now fix the utrs, operating relative to the dir of transcription
 271
 272   #set the end of the five_prime_UTR to be next to the beginning of
 273   #the first CDS
 274   if($features[0]->type->name eq 'five_prime_UTR') {
 275     fse($features[0],'end', add(fse($features[1],'start'),-1));
 276   }
 277   sub fts {
 278     my ($feat) = @_;
 279     join(' ',map {"'$_'"} $feat->type->name,$feat->start,$feat->end)."\n";
 280   }
 281   #add 5 to the polyA signal site coordinate if present so that it
 282   #encompasses the whole poly-A signal sequence
 283   if($features[-1]->type->name eq 'polyA_signal_sequence') {
 284     $polya = pop @features;
 285
 286     #reparent the polya feature to be part of the gene, not the mRNA,
 287     #in order to conform with SO and SOFA
 288     #TODO
 289
 290
 291     fse($polya,'end',  add(fse($polya,'start'),5));
 292     #make a new three_prime_UTR feature from the end of the last CDS
 293     #encompassing the polyA signal site
 294     my $utrid = $polya->get_Annotations('ID')->value;
 295     $utrid =~ s/gene_(\d+)-.+$/gene_$1-three_prime_UTR/;
 296     my $utr3 = Bio::SeqFeature::Annotated->new( -feature => $polya,
 297                                                 -type => 'three_prime_UTR',
 298                                                 -annots => { ID => $utrid },
 299                                               );
 300 #    $utr3->seq_id('foobar');
 301     fse($utr3,'start', add(fse($features[-1],'end'),1));
 302 #    warn "adding utr3 ".fts($utr3);
 303     push @features,$utr3;
 304   }
 305
 306   #now reverse again to put it back in the original order
 307   @features = reverse @features if $strand == -1;
 308
 309 #  warn "\nfeatures are now:\n",map {fts($_)} @features;# if $strand == -1;
 310
 311   #make all of these subfeatures of a single mRNA feature, with Parent
 312   #annotations to match
 313   my $mrna_id = $features[0]->get_Annotations('ID')->value;
 314   $mrna_id =~ s/gene_(\d+)-.+$/gene_$1-mRNA/;
 315   my $mrna = Bio::SeqFeature::Annotated->new( -feature => $features[0],
 316                                               -end => $features[-1]->end,
 317                                               -type => 'mRNA',
 318                                               -score => '.',
 319                                               -annots => { ID => $mrna_id},
 320                                             );
 321   foreach(@features) {
 322     $_->add_Annotation(Parent => Bio::Annotation::SimpleValue->new(-value => $mrna_id));
 323     $mrna->add_SeqFeature($_);
 324   }
 325
 326   return $mrna,$polya;
 327 }
 328
 329 =head2 write_feature()
 330
 331 Not implemented.
 332
 333 =cut
 334
 335 sub write_feature {
 336   shift->throw_not_implemented;
 337 }
 338
 339 ###
 340 1;# do not remove
 341 ###
 342