5 Bio::FeatureIO::fgenesh - FeatureIO parser for FGENESH reports
9 my $feature_in = Bio::FeatureIO->new(-format => 'fgenesh',
10 -file => 'myfile.fgenesh',
13 while(my $feat = $feature_in->next_feature) {
14 #do something with the feature
20 Parses FGENESH reports into L<Bio::SeqFeature::Annotated> objects,
21 with subfeatures. The feature hierarchy is:
30 One or both of the UTRs and/or polyA_signal_site might be missing.
31 Sometimes FGENESH does not provide them.
37 User feedback is an integral part of the evolution of this and other
38 Bioperl modules. Send your comments and suggestions preferably to
39 the Bioperl mailing list. Your participation is much appreciated.
41 bioperl-l@bioperl.org - General discussion
42 http://bioperl.org/MailList.shtml - About the mailing lists
46 Report bugs to the Bioperl bug tracking system to help us keep track
47 of the bugs and their resolution. Bug reports can be submitted via
50 http://bugzilla.bioperl.org/
54 Robert Buels, rmb32@cornell.edu
58 The rest of the documentation details each of the object methods.
59 Internal methods are usually preceded with a _
63 # Let the code begin...
65 package Bio
::FeatureIO
::fgenesh
;
68 use base
qw(Bio::FeatureIO);
70 use Bio
::SeqFeature
::Annotated
;
71 use Bio
::Annotation
::Target
;
75 returns a new parser FeatureIO::fgenesh object
80 # my($self,%arg) = @_;
82 # $self->SUPER::_initialize(%arg);
85 # $self->{alignment_buffer} = [];
86 # $self->{feature_buffer} = [];
89 # $arg{-mode} ||= 'pgls';
90 # $arg{-attach_alignments} = 0 unless defined $arg{-attach_alignments};
93 # $self->mode($arg{-mode});
94 # $self->attach_alignments($arg{-attach_alignments});
97 # my $alignment_buffer = $self->mode eq 'both_merged' || $self->mode eq 'alignments_merged'
98 # ? $self->{feature_buffer}
99 # : $self->{alignment_buffer};
100 # my @spliced_alignment_twig = $self->mode eq 'pgls'
102 # : (spliced_alignment => sub {
103 # push @$alignment_buffer,
104 # $self->_parse_alignment(@_);
107 # my @pgl_twig = $self->mode eq 'alignments' || $self->mode eq 'alignments_merged'
109 # : (predicted_gene_location => sub {
110 # push @{$self->{feature_buffer}},
111 # $self->_parse_pgl(@_);
115 # #now parse the entire input file, buffering pgls and alignments
116 # XML::Twig->new( twig_roots =>
118 # @spliced_alignment_twig,
121 # )->parse($self->_fh);
124 =head2 next_feature()
126 Usage : my $feature = $featureio->next_feature();
127 Function: returns the next available gene prediction.
128 Predictions will be returned in the same order as they
130 Returns : a predicted gene feature, of class
131 L<Bio::SeqFeature::Annotated>
139 #return a buffered feature if we have one
140 if($self->{feature_buffer
} && @
{$self->{feature_buffer
}}) {
141 return shift @
{$self->{feature_buffer
}};
144 #otherwise, continue parsing
145 while(my $line = $self->_readline) {
146 if( $self->_match( $line => 'is_prediction_line' )) {
147 #must be a prediction line
148 $self->_pushback($line);
149 my ($mrna,$polya) = $self->_parse_prediction;
150 push @
{$self->{feature_buffer
}},$polya if $polya;
153 elsif( my ($seqname) = $self->_match( $line => 'capture_seq_name' )) {
154 $self->{seqname
} = $seqname;
155 #save the sequence name for further reference and continue
157 elsif( my ($pname) = $self->_match( $line => 'capture_params_name' )) {
158 $self->{paramsname
} = $pname;
160 elsif( $self->_match( $line => 'is_start_proteins' )) {
161 #we end our parsing when the predicted protein section begins
163 #gotta push it back so we'll hit it again if the user calls
165 $self->_pushback($line);
171 #central function for keeping all the regexps used in this parser,
172 #returns the result of matching the given text against the pattern
175 my ($self,$text,$patname) = @_;
176 my %pats = ( is_prediction_line
=> qr/^\s*\d+\s*[+-]/,
177 capture_seq_name
=> qr/^\s*Seq(?:uence)?\s*name\s*:\s*(\S+)/i,
178 capture_params_name
=> qr/^\s*FGENESH.+prediction.+in (\S+).+DNA/i,
179 is_start_proteins
=> qr/^\s*Predicted protein(s):/i,
181 $pats{$patname} or $self->throw("unknown pattern name '$patname'");
182 return $text =~ $pats{$patname};
185 sub _parse_prediction
{
190 my $polya; #< store the polya separately if present
194 while($line = $self->_readline and $self->_match($line => 'is_prediction_line')) {
195 # warn "parsing line: $line";
196 my @data = split /\s+/,$line;
197 shift @data until $data[0]; #< get rid of leading whitespace
199 # warn "data is: ".join(',',@data)."\n";
201 #### now parse the prediction line in earnest
203 my ($prediction_number,$strand) = splice @data,0,2;
205 #next thing might be either an exon number or the type
206 my $exon_num = $data[0] =~ /^\d+$/ ?
shift(@data) : undef;
208 my $type = shift @data;
209 my $start = shift @data;
211 my ($end,$score) = do {
212 if($data[0] eq '-') {
216 $start,shift @data #< has no end, thus end is same as start
220 # my $orf_start = shift @data;
222 # my $orf_end = shift @data;
223 # $orf_end = shift @data if $orf_end eq '-';
225 ### now make a feature out of it
227 TSS
=> 'five_prime_UTR',
228 PolA
=> 'polyA_signal_sequence',
234 $type = $type_map{$type} || do {$self->warn("could not convert fgenesh type '$type' to a SOFA type");
235 'region'}; #< default to 'region' and warn about it
236 my $pname = $self->{paramsname
} ?
"_$self->{paramsname}" : '';
237 push @features, Bio
::SeqFeature
::Annotated
->new
238 ( -start
=> $start, -end
=> $end, -strand
=> $strand,
241 -seq_id
=> defined $self->{seqname
} ?
$self->{seqname
}
242 : $self->throw("parse error, no sequence line found before predictions"),
243 -source
=> "FGENESH$pname",
244 -annots
=> { ID
=> "FGENESH$pname-$self->{seqname}-gene_$prediction_number-$type".($exon_num ?
"-$exon_num" : '')},
248 ### now go back and correct/add some things
249 #make an array with features ordered from transcription start to end
250 our $strand = $features[0]->strand;
251 @features = reverse @features if $strand == -1;
253 #make subs to operate on a feature relative to the direction of transcription
254 sub se
{ #< return start/end string, relative to dir of transcription
256 return $strand == -1 ?
( $se eq 'start' ?
'end' : 'start' ) : $se;
258 sub fse
{ #< access feature start/end relative to dir of transcription
259 my ($feat,$se,$val) = @_;
261 return $feat->$se($val);
263 sub add
{ #< sub to add $a and $b, reversing +/- if on reverse strand
270 #now fix the utrs, operating relative to the dir of transcription
272 #set the end of the five_prime_UTR to be next to the beginning of
274 if($features[0]->type->name eq 'five_prime_UTR') {
275 fse
($features[0],'end', add
(fse
($features[1],'start'),-1));
279 join(' ',map {"'$_'"} $feat->type->name,$feat->start,$feat->end)."\n";
281 #add 5 to the polyA signal site coordinate if present so that it
282 #encompasses the whole poly-A signal sequence
283 if($features[-1]->type->name eq 'polyA_signal_sequence') {
284 $polya = pop @features;
286 #reparent the polya feature to be part of the gene, not the mRNA,
287 #in order to conform with SO and SOFA
291 fse
($polya,'end', add
(fse
($polya,'start'),5));
292 #make a new three_prime_UTR feature from the end of the last CDS
293 #encompassing the polyA signal site
294 my $utrid = $polya->get_Annotations('ID')->value;
295 $utrid =~ s/gene_(\d+)-.+$/gene_$1-three_prime_UTR/;
296 my $utr3 = Bio
::SeqFeature
::Annotated
->new( -feature
=> $polya,
297 -type
=> 'three_prime_UTR',
298 -annots
=> { ID
=> $utrid },
300 # $utr3->seq_id('foobar');
301 fse
($utr3,'start', add
(fse
($features[-1],'end'),1));
302 # warn "adding utr3 ".fts($utr3);
303 push @features,$utr3;
306 #now reverse again to put it back in the original order
307 @features = reverse @features if $strand == -1;
309 # warn "\nfeatures are now:\n",map {fts($_)} @features;# if $strand == -1;
311 #make all of these subfeatures of a single mRNA feature, with Parent
312 #annotations to match
313 my $mrna_id = $features[0]->get_Annotations('ID')->value;
314 $mrna_id =~ s/gene_(\d+)-.+$/gene_$1-mRNA/;
315 my $mrna = Bio
::SeqFeature
::Annotated
->new( -feature
=> $features[0],
316 -end
=> $features[-1]->end,
319 -annots
=> { ID
=> $mrna_id},
322 $_->add_Annotation(Parent
=> Bio
::Annotation
::SimpleValue
->new(-value
=> $mrna_id));
323 $mrna->add_SeqFeature($_);
329 =head2 write_feature()
336 shift->throw_not_implemented;