a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / Bio / FeatureIO / fgenesh.pm
blob34d954244e882fd7b1aa9e48dce20a25f690e4a2
1 =pod
3 =head1 NAME
5 Bio::FeatureIO::fgenesh - FeatureIO parser for FGENESH reports
7 =head1 SYNOPSIS
9 my $feature_in = Bio::FeatureIO->new(-format => 'fgenesh',
10 -file => 'myfile.fgenesh',
13 while(my $feat = $feature_in->next_feature) {
14 #do something with the feature
18 =head1 DESCRIPTION
20 Parses FGENESH reports into L<Bio::SeqFeature::Annotated> objects,
21 with subfeatures. The feature hierarchy is:
23 mRNA
24 |- five_prime_UTR
25 |- CDS
26 |- ...
27 `- three_prime_UTR
28 polyA_signal_site
30 One or both of the UTRs and/or polyA_signal_site might be missing.
31 Sometimes FGENESH does not provide them.
33 =head1 FEEDBACK
35 =head2 Mailing Lists
37 User feedback is an integral part of the evolution of this and other
38 Bioperl modules. Send your comments and suggestions preferably to
39 the Bioperl mailing list. Your participation is much appreciated.
41 bioperl-l@bioperl.org - General discussion
42 http://bioperl.org/MailList.shtml - About the mailing lists
44 =head2 Reporting Bugs
46 Report bugs to the Bioperl bug tracking system to help us keep track
47 of the bugs and their resolution. Bug reports can be submitted via
48 the web:
50 http://bugzilla.bioperl.org/
52 =head1 AUTHOR
54 Robert Buels, rmb32@cornell.edu
56 =head1 APPENDIX
58 The rest of the documentation details each of the object methods.
59 Internal methods are usually preceded with a _
61 =cut
63 # Let the code begin...
65 package Bio::FeatureIO::fgenesh;
66 use strict;
68 use base qw(Bio::FeatureIO);
70 use Bio::SeqFeature::Annotated;
71 use Bio::Annotation::Target;
73 =head2 new()
75 returns a new parser FeatureIO::fgenesh object
77 =cut
79 # sub _initialize {
80 # my($self,%arg) = @_;
82 # $self->SUPER::_initialize(%arg);
84 # #init buffers
85 # $self->{alignment_buffer} = [];
86 # $self->{feature_buffer} = [];
88 # #set defaults
89 # $arg{-mode} ||= 'pgls';
90 # $arg{-attach_alignments} = 0 unless defined $arg{-attach_alignments};
92 # #set options
93 # $self->mode($arg{-mode});
94 # $self->attach_alignments($arg{-attach_alignments});
97 # my $alignment_buffer = $self->mode eq 'both_merged' || $self->mode eq 'alignments_merged'
98 # ? $self->{feature_buffer}
99 # : $self->{alignment_buffer};
100 # my @spliced_alignment_twig = $self->mode eq 'pgls'
101 # ? ()
102 # : (spliced_alignment => sub {
103 # push @$alignment_buffer,
104 # $self->_parse_alignment(@_);
105 # shift->purge;
106 # });
107 # my @pgl_twig = $self->mode eq 'alignments' || $self->mode eq 'alignments_merged'
108 # ? ()
109 # : (predicted_gene_location => sub {
110 # push @{$self->{feature_buffer}},
111 # $self->_parse_pgl(@_);
112 # shift->purge;
113 # });
115 # #now parse the entire input file, buffering pgls and alignments
116 # XML::Twig->new( twig_roots =>
118 # @spliced_alignment_twig,
119 # @pgl_twig,
121 # )->parse($self->_fh);
124 =head2 next_feature()
126 Usage : my $feature = $featureio->next_feature();
127 Function: returns the next available gene prediction.
128 Predictions will be returned in the same order as they
129 appear in the file.
130 Returns : a predicted gene feature, of class
131 L<Bio::SeqFeature::Annotated>
132 Args : none
134 =cut
136 sub next_feature {
137 my ($self) = @_;
139 #return a buffered feature if we have one
140 if($self->{feature_buffer} && @{$self->{feature_buffer}}) {
141 return shift @{$self->{feature_buffer}};
144 #otherwise, continue parsing
145 while(my $line = $self->_readline) {
146 if( $self->_match( $line => 'is_prediction_line' )) {
147 #must be a prediction line
148 $self->_pushback($line);
149 my ($mrna,$polya) = $self->_parse_prediction;
150 push @{$self->{feature_buffer}},$polya if $polya;
151 return $mrna;
153 elsif( my ($seqname) = $self->_match( $line => 'capture_seq_name' )) {
154 $self->{seqname} = $seqname;
155 #save the sequence name for further reference and continue
157 elsif( my ($pname) = $self->_match( $line => 'capture_params_name' )) {
158 $self->{paramsname} = $pname;
160 elsif( $self->_match( $line => 'is_start_proteins' )) {
161 #we end our parsing when the predicted protein section begins
163 #gotta push it back so we'll hit it again if the user calls
164 #next_feature again
165 $self->_pushback($line);
166 return;
171 #central function for keeping all the regexps used in this parser,
172 #returns the result of matching the given text against the pattern
173 #with the given name
174 sub _match {
175 my ($self,$text,$patname) = @_;
176 my %pats = ( is_prediction_line => qr/^\s*\d+\s*[+-]/,
177 capture_seq_name => qr/^\s*Seq(?:uence)?\s*name\s*:\s*(\S+)/i,
178 capture_params_name=> qr/^\s*FGENESH.+prediction.+in (\S+).+DNA/i,
179 is_start_proteins => qr/^\s*Predicted protein(s):/i,
181 $pats{$patname} or $self->throw("unknown pattern name '$patname'");
182 return $text =~ $pats{$patname};
185 sub _parse_prediction {
186 my ($self) = @_;
188 #store the lines
189 my @features;
190 my $polya; #< store the polya separately if present
192 my $line;
194 while($line = $self->_readline and $self->_match($line => 'is_prediction_line')) {
195 # warn "parsing line: $line";
196 my @data = split /\s+/,$line;
197 shift @data until $data[0]; #< get rid of leading whitespace
199 # warn "data is: ".join(',',@data)."\n";
201 #### now parse the prediction line in earnest
203 my ($prediction_number,$strand) = splice @data,0,2;
205 #next thing might be either an exon number or the type
206 my $exon_num = $data[0] =~ /^\d+$/ ? shift(@data) : undef;
208 my $type = shift @data;
209 my $start = shift @data;
211 my ($end,$score) = do {
212 if($data[0] eq '-') {
213 shift @data;
214 splice @data,0,2
215 } else {
216 $start,shift @data #< has no end, thus end is same as start
220 # my $orf_start = shift @data;
222 # my $orf_end = shift @data;
223 # $orf_end = shift @data if $orf_end eq '-';
225 ### now make a feature out of it
226 my %type_map = (
227 TSS => 'five_prime_UTR',
228 PolA => 'polyA_signal_sequence',
229 CDSi => 'CDS',
230 CDSf => 'CDS',
231 CDSl => 'CDS',
232 CDSo => 'CDS',
234 $type = $type_map{$type} || do {$self->warn("could not convert fgenesh type '$type' to a SOFA type");
235 'region'}; #< default to 'region' and warn about it
236 my $pname = $self->{paramsname} ? "_$self->{paramsname}" : '';
237 push @features, Bio::SeqFeature::Annotated->new
238 ( -start => $start, -end => $end, -strand => $strand,
239 -score => $score,
240 -type => $type,
241 -seq_id => defined $self->{seqname} ? $self->{seqname}
242 : $self->throw("parse error, no sequence line found before predictions"),
243 -source => "FGENESH$pname",
244 -annots => { ID => "FGENESH$pname-$self->{seqname}-gene_$prediction_number-$type".($exon_num ? "-$exon_num" : '')},
248 ### now go back and correct/add some things
249 #make an array with features ordered from transcription start to end
250 our $strand = $features[0]->strand;
251 @features = reverse @features if $strand == -1;
253 #make subs to operate on a feature relative to the direction of transcription
254 sub se { #< return start/end string, relative to dir of transcription
255 my ($se) = @_;
256 return $strand == -1 ? ( $se eq 'start' ? 'end' : 'start' ) : $se;
258 sub fse { #< access feature start/end relative to dir of transcription
259 my ($feat,$se,$val) = @_;
260 $se = se($se);
261 return $feat->$se($val);
263 sub add { #< sub to add $a and $b, reversing +/- if on reverse strand
264 my ($a,$b) = @_;
265 return $strand == -1
266 ? $a - $b
267 : $a + $b;
270 #now fix the utrs, operating relative to the dir of transcription
272 #set the end of the five_prime_UTR to be next to the beginning of
273 #the first CDS
274 if($features[0]->type->name eq 'five_prime_UTR') {
275 fse($features[0],'end', add(fse($features[1],'start'),-1));
277 sub fts {
278 my ($feat) = @_;
279 join(' ',map {"'$_'"} $feat->type->name,$feat->start,$feat->end)."\n";
281 #add 5 to the polyA signal site coordinate if present so that it
282 #encompasses the whole poly-A signal sequence
283 if($features[-1]->type->name eq 'polyA_signal_sequence') {
284 $polya = pop @features;
286 #reparent the polya feature to be part of the gene, not the mRNA,
287 #in order to conform with SO and SOFA
288 #TODO
291 fse($polya,'end', add(fse($polya,'start'),5));
292 #make a new three_prime_UTR feature from the end of the last CDS
293 #encompassing the polyA signal site
294 my $utrid = $polya->get_Annotations('ID')->value;
295 $utrid =~ s/gene_(\d+)-.+$/gene_$1-three_prime_UTR/;
296 my $utr3 = Bio::SeqFeature::Annotated->new( -feature => $polya,
297 -type => 'three_prime_UTR',
298 -annots => { ID => $utrid },
300 # $utr3->seq_id('foobar');
301 fse($utr3,'start', add(fse($features[-1],'end'),1));
302 # warn "adding utr3 ".fts($utr3);
303 push @features,$utr3;
306 #now reverse again to put it back in the original order
307 @features = reverse @features if $strand == -1;
309 # warn "\nfeatures are now:\n",map {fts($_)} @features;# if $strand == -1;
311 #make all of these subfeatures of a single mRNA feature, with Parent
312 #annotations to match
313 my $mrna_id = $features[0]->get_Annotations('ID')->value;
314 $mrna_id =~ s/gene_(\d+)-.+$/gene_$1-mRNA/;
315 my $mrna = Bio::SeqFeature::Annotated->new( -feature => $features[0],
316 -end => $features[-1]->end,
317 -type => 'mRNA',
318 -score => '.',
319 -annots => { ID => $mrna_id},
321 foreach(@features) {
322 $_->add_Annotation(Parent => Bio::Annotation::SimpleValue->new(-value => $mrna_id));
323 $mrna->add_SeqFeature($_);
326 return $mrna,$polya;
329 =head2 write_feature()
331 Not implemented.
333 =cut
335 sub write_feature {
336 shift->throw_not_implemented;
340 1;# do not remove