2 # Please direct questions and support issues to <bioperl-l@bioperl.org>
4 # Cared for by Keith James
6 # Copyright Genome Research Ltd.
8 # You may distribute this module under the same terms as Perl itself
10 # POD documentation - main docs before the code
17 Bio::Tools::Geneid - Results of one geneid run
21 use Bio::Tools::Geneid;
22 my $gid = Bio::Tools::Geneid(-file => "geneid.out");
24 while (my $gene = $gid->next_prediction)
26 my @transcripts = $gene->transcripts;
27 foreach my $t (@transcripts)
29 my @exons = $t->exons;
30 foreach my $e (@exons)
32 printf("Exon %d..%d\n", $e->start, $e->end);
39 This is the parser for the output of geneid by Enrique Blanco and
40 Roderic GuigE<243> (IMIM-UPF). See http://www1.imim.es/software/geneid. It
41 relies on native geneid output format internally and will work with
42 geneid versions 1.0 and 1.1. Currently this module supports only the
43 default mode of operation which is to predict exons and assemble an
44 optimal gene prediction.
46 It takes either a file handle or a file name and returns a
47 Bio::SeqFeature::Gene::GeneStructure object.
53 User feedback is an integral part of the evolution of this and other
54 Bioperl modules. Send your comments and suggestions preferably to one
55 of the Bioperl mailing lists. Your participation is much appreciated.
57 bioperl-l@bioperl.org - General discussion
58 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
62 Please direct usage questions or support issues to the mailing list:
64 I<bioperl-l@bioperl.org>
66 rather than to the module maintainer directly. Many experienced and
67 reponsive experts will be able look at the problem and quickly
68 address it. Please include a thorough description of the problem
69 with code and data examples if at all possible.
73 Report bugs to the Bioperl bug tracking system to help us keep track
74 the bugs and their resolution. Bug reports can be submitted via the
77 https://github.com/bioperl/bioperl-live/issues
79 =head1 AUTHOR - Keith James
81 Email: kdj@sanger.ac.uk
85 The rest of the documentation details each of the object methods.
86 Internal methods are usually preceded with a _
90 package Bio
::Tools
::Geneid
;
92 use vars
qw($SOURCE_TAG);
95 use Bio::Tools::AnalysisResult;
96 use Bio::SeqFeature::Generic;
97 use Bio::SeqFeature::Gene::Exon;
98 use Bio::SeqFeature::Gene::Transcript;
99 use Bio::SeqFeature::Gene::GeneStructure;
101 use base qw(Bio::Root::Root Bio::Root::IO);
102 $SOURCE_TAG = 'geneid';
107 Usage : $obj->new(-file = "<geneid.out");
108 $obj->new(-fh => \*GI);
109 Function: Constructor for geneid wrapper. Takes either a file
111 Returns : L<Bio::Tools::Geneid>
117 my($class, @args) = @_;
118 my $self = $class->SUPER::new
(@args);
119 $self->_initialize_io(@args);
123 =head2 next_prediction
125 Title : next_prediction
126 Usage : while($gene = $geneid->next_prediction)
130 Function: Returns the gene structure prediction of the geneid result
131 file. Call this method repeatedly until FALSE is returned.
132 Returns : A Bio::SeqFeature::Gene::GeneStructure object
141 my ($gene, $transcript, $current_gene_id);
142 my $transcript_score = 0;
144 my ($gene_id, $exon_type, $exon_start, $exon_end, $exon_score,
145 $exon_strand, $start_phase, $end_phase, $start_sig_score,
146 $end_sig_score, $coding_pot_score, $homol_score);
148 while (defined($_ = $self->_readline))
155 # We have a choice of geneid, gff or XML formats. The native
156 # geneid format has more information than gff. However, we
157 # then need to perform the hack of extracting the sequence ID
158 # from the header of the embedded Fasta file which comes after
159 # the exon data, as it is not stored elsewhere. Ack.
160 # the new version of geneID includes the sequence ID in a slightly
161 # different format and a new "or" statement was added below
162 # also removed "unless defined $self->_target_id;" inorder to continue
163 # generating new sequence IDs.
165 if (/^>(\S+)\|GeneId/ or /^# Sequence (\S+)/)
168 $self->_target_id($target_id);
172 next unless (/(Single|First|Internal|Terminal)/);
174 my @fields = split(/\s+/, $_);
176 # Grab gene_id from eol first as there are issues with
177 # inconsistent whitespace in the AA coords field
178 $gene_id = pop @fields;
180 ($exon_type, $exon_start, $exon_end, $exon_score,
181 $exon_strand, $start_phase, $end_phase, $start_sig_score,
182 $end_sig_score, $coding_pot_score, $homol_score) = @fields[0..10];
184 if (! defined $current_gene_id)
186 # Starting the requested prediction
187 $current_gene_id = $gene_id;
188 $transcript_score = $exon_score;
190 $gene = Bio
::SeqFeature
::Gene
::GeneStructure
->new(-source
=>
192 $transcript = Bio
::SeqFeature
::Gene
::Transcript
->new(-source
=>
195 $self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score,
196 $exon_strand, $start_phase, $end_phase, $start_sig_score,
197 $end_sig_score, $coding_pot_score, $homol_score);
199 elsif ($gene_id eq $current_gene_id)
201 # Still in requested prediction
202 $transcript_score += $exon_score;
204 $self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score,
205 $exon_strand, $start_phase, $end_phase, $start_sig_score,
206 $end_sig_score, $coding_pot_score, $homol_score);
210 # Found following prediction
211 $self->_pushback($_);
218 $transcript->seq_id($self->_target_id);
219 $transcript->score($transcript_score);
220 $gene->add_transcript($transcript);
221 $gene->seq_id($self->_target_id);
223 foreach my $exon ($gene->exons)
225 $exon->seq_id($self->_target_id);
228 $self->_set_strand($gene);
237 Usage : $obj->_add_exon($gene, $transcript, ... exon data ...)
238 Function: Adds a new exon to both gene and transcript from the data
247 my ($self, $gene, $transcript, $exon_type, $exon_start, $exon_end,
248 $exon_score, $exon_strand, $start_phase, $end_phase, $start_sig_score,
249 $end_sig_score, $coding_pot_score, $homol_score) = @_;
251 $exon_type =~ s/First/Initial/;
253 my $strand = $exon_strand eq '+' ?
1 : -1;
255 my $exon = Bio
::SeqFeature
::Gene
::Exon
->new(-source
=> $SOURCE_TAG,
256 -start
=> $exon_start,
259 -score
=> $exon_score);
261 $exon->add_tag_value("Type", $exon_type);
262 $exon->add_tag_value('phase', $start_phase);
263 $exon->add_tag_value('end_phase', $end_phase);
264 $exon->add_tag_value('start_signal_score', $start_sig_score);
265 $exon->add_tag_value('end_signal_score', $end_sig_score);
266 $exon->add_tag_value('coding_potential_score', $coding_pot_score);
267 $exon->add_tag_value('homology_score', $homol_score);
269 $transcript->strand($strand) unless $transcript->strand != 0;
270 $transcript->add_exon($exon, $exon_type);
276 Usage : $obj->_set_strand($gene)
277 Function: Sets the overall gene strand to the same strand as all
278 : the exons if they are all on the same strand, or to strand 0
279 : if the exons are on different strands.
287 my ($self, $gene) = @_;
292 my @exons = $gene->exons;
293 foreach my $exon (@exons)
295 my $strand = $exon->strand;
301 elsif ($strand == -1)
311 elsif ($#exons == $rev)
326 Usage : $obj->_target_id
327 Function: get/set for genomic sequence id
329 Returns : A target ID
335 my ($self,$val) = @_;
338 $self->{'_target_id'} = $val;
341 return $self->{'_target_id'};