Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / Tools / Geneid.pm
blobe0f66835ba5ec67e98d6b6fe7fad296e15771246
2 # Please direct questions and support issues to <bioperl-l@bioperl.org>
4 # Cared for by Keith James
6 # Copyright Genome Research Ltd.
8 # You may distribute this module under the same terms as Perl itself
10 # POD documentation - main docs before the code
13 =encoding utf-8
15 =head1 NAME
17 Bio::Tools::Geneid - Results of one geneid run
19 =head1 SYNOPSIS
21 use Bio::Tools::Geneid;
22 my $gid = Bio::Tools::Geneid(-file => "geneid.out");
24 while (my $gene = $gid->next_prediction)
26 my @transcripts = $gene->transcripts;
27 foreach my $t (@transcripts)
29 my @exons = $t->exons;
30 foreach my $e (@exons)
32 printf("Exon %d..%d\n", $e->start, $e->end);
37 =head1 DESCRIPTION
39 This is the parser for the output of geneid by Enrique Blanco and
40 Roderic GuigE<243> (IMIM-UPF). See http://www1.imim.es/software/geneid. It
41 relies on native geneid output format internally and will work with
42 geneid versions 1.0 and 1.1. Currently this module supports only the
43 default mode of operation which is to predict exons and assemble an
44 optimal gene prediction.
46 It takes either a file handle or a file name and returns a
47 Bio::SeqFeature::Gene::GeneStructure object.
49 =head1 FEEDBACK
51 =head2 Mailing Lists
53 User feedback is an integral part of the evolution of this and other
54 Bioperl modules. Send your comments and suggestions preferably to one
55 of the Bioperl mailing lists. Your participation is much appreciated.
57 bioperl-l@bioperl.org - General discussion
58 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
60 =head2 Support
62 Please direct usage questions or support issues to the mailing list:
64 I<bioperl-l@bioperl.org>
66 rather than to the module maintainer directly. Many experienced and
67 reponsive experts will be able look at the problem and quickly
68 address it. Please include a thorough description of the problem
69 with code and data examples if at all possible.
71 =head2 Reporting Bugs
73 Report bugs to the Bioperl bug tracking system to help us keep track
74 the bugs and their resolution. Bug reports can be submitted via the
75 web:
77 https://github.com/bioperl/bioperl-live/issues
79 =head1 AUTHOR - Keith James
81 Email: kdj@sanger.ac.uk
83 =head1 APPENDIX
85 The rest of the documentation details each of the object methods.
86 Internal methods are usually preceded with a _
88 =cut
90 package Bio::Tools::Geneid;
92 use vars qw($SOURCE_TAG);
93 use strict;
95 use Bio::Tools::AnalysisResult;
96 use Bio::SeqFeature::Generic;
97 use Bio::SeqFeature::Gene::Exon;
98 use Bio::SeqFeature::Gene::Transcript;
99 use Bio::SeqFeature::Gene::GeneStructure;
101 use base qw(Bio::Root::Root Bio::Root::IO);
102 $SOURCE_TAG = 'geneid';
104 =head2 new
106 Title : new
107 Usage : $obj->new(-file = "<geneid.out");
108 $obj->new(-fh => \*GI);
109 Function: Constructor for geneid wrapper. Takes either a file
110 : or filehandle
111 Returns : L<Bio::Tools::Geneid>
113 =cut
115 sub new
117 my($class, @args) = @_;
118 my $self = $class->SUPER::new(@args);
119 $self->_initialize_io(@args);
120 return $self;
123 =head2 next_prediction
125 Title : next_prediction
126 Usage : while($gene = $geneid->next_prediction)
128 # do something
130 Function: Returns the gene structure prediction of the geneid result
131 file. Call this method repeatedly until FALSE is returned.
132 Returns : A Bio::SeqFeature::Gene::GeneStructure object
133 Args : None
135 =cut
137 sub next_prediction
139 my ($self) = @_;
141 my ($gene, $transcript, $current_gene_id);
142 my $transcript_score = 0;
144 my ($gene_id, $exon_type, $exon_start, $exon_end, $exon_score,
145 $exon_strand, $start_phase, $end_phase, $start_sig_score,
146 $end_sig_score, $coding_pot_score, $homol_score);
148 while (defined($_ = $self->_readline))
150 $self->debug($_);
152 s/^\s+//;
153 s/\s+$//;
155 # We have a choice of geneid, gff or XML formats. The native
156 # geneid format has more information than gff. However, we
157 # then need to perform the hack of extracting the sequence ID
158 # from the header of the embedded Fasta file which comes after
159 # the exon data, as it is not stored elsewhere. Ack.
160 # the new version of geneID includes the sequence ID in a slightly
161 # different format and a new "or" statement was added below
162 # also removed "unless defined $self->_target_id;" inorder to continue
163 # generating new sequence IDs.
165 if (/^>(\S+)\|GeneId/ or /^# Sequence (\S+)/)
167 my $target_id = $1;
168 $self->_target_id($target_id);
169 next;
172 next unless (/(Single|First|Internal|Terminal)/);
174 my @fields = split(/\s+/, $_);
176 # Grab gene_id from eol first as there are issues with
177 # inconsistent whitespace in the AA coords field
178 $gene_id = pop @fields;
180 ($exon_type, $exon_start, $exon_end, $exon_score,
181 $exon_strand, $start_phase, $end_phase, $start_sig_score,
182 $end_sig_score, $coding_pot_score, $homol_score) = @fields[0..10];
184 if (! defined $current_gene_id)
186 # Starting the requested prediction
187 $current_gene_id = $gene_id;
188 $transcript_score = $exon_score;
190 $gene = Bio::SeqFeature::Gene::GeneStructure->new(-source =>
191 $SOURCE_TAG);
192 $transcript = Bio::SeqFeature::Gene::Transcript->new(-source =>
193 $SOURCE_TAG);
195 $self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score,
196 $exon_strand, $start_phase, $end_phase, $start_sig_score,
197 $end_sig_score, $coding_pot_score, $homol_score);
199 elsif ($gene_id eq $current_gene_id)
201 # Still in requested prediction
202 $transcript_score += $exon_score;
204 $self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score,
205 $exon_strand, $start_phase, $end_phase, $start_sig_score,
206 $end_sig_score, $coding_pot_score, $homol_score);
208 else
210 # Found following prediction
211 $self->_pushback($_);
212 last;
216 if (defined $gene)
218 $transcript->seq_id($self->_target_id);
219 $transcript->score($transcript_score);
220 $gene->add_transcript($transcript);
221 $gene->seq_id($self->_target_id);
223 foreach my $exon ($gene->exons)
225 $exon->seq_id($self->_target_id);
228 $self->_set_strand($gene);
231 return $gene;
234 =head2 _add_exon
236 Title : _add_exon
237 Usage : $obj->_add_exon($gene, $transcript, ... exon data ...)
238 Function: Adds a new exon to both gene and transcript from the data
239 : supplied as args
240 Example :
241 Returns : Nothing
243 =cut
245 sub _add_exon
247 my ($self, $gene, $transcript, $exon_type, $exon_start, $exon_end,
248 $exon_score, $exon_strand, $start_phase, $end_phase, $start_sig_score,
249 $end_sig_score, $coding_pot_score, $homol_score) = @_;
251 $exon_type =~ s/First/Initial/;
253 my $strand = $exon_strand eq '+' ? 1 : -1;
255 my $exon = Bio::SeqFeature::Gene::Exon->new(-source => $SOURCE_TAG,
256 -start => $exon_start,
257 -end => $exon_end,
258 -strand => $strand,
259 -score => $exon_score);
260 $exon->is_coding(1);
261 $exon->add_tag_value("Type", $exon_type);
262 $exon->add_tag_value('phase', $start_phase);
263 $exon->add_tag_value('end_phase', $end_phase);
264 $exon->add_tag_value('start_signal_score', $start_sig_score);
265 $exon->add_tag_value('end_signal_score', $end_sig_score);
266 $exon->add_tag_value('coding_potential_score', $coding_pot_score);
267 $exon->add_tag_value('homology_score', $homol_score);
269 $transcript->strand($strand) unless $transcript->strand != 0;
270 $transcript->add_exon($exon, $exon_type);
273 =head2 _set_strand
275 Title : _set_strand
276 Usage : $obj->_set_strand($gene)
277 Function: Sets the overall gene strand to the same strand as all
278 : the exons if they are all on the same strand, or to strand 0
279 : if the exons are on different strands.
280 Example :
281 Returns : Nothing
283 =cut
285 sub _set_strand
287 my ($self, $gene) = @_;
289 my $fwd = 0;
290 my $rev = 0;
292 my @exons = $gene->exons;
293 foreach my $exon (@exons)
295 my $strand = $exon->strand;
297 if ($strand == 1)
299 $fwd++;
301 elsif ($strand == -1)
303 $rev++;
307 if ($#exons == $fwd)
309 $gene->strand(1);
311 elsif ($#exons == $rev)
313 $gene->strand(-1);
315 else
317 $gene->strand(0);
320 return $gene;
323 =head2 _target_id
325 Title : _target_id
326 Usage : $obj->_target_id
327 Function: get/set for genomic sequence id
328 Example :
329 Returns : A target ID
331 =cut
333 sub _target_id
335 my ($self,$val) = @_;
336 if ($val)
338 $self->{'_target_id'} = $val;
341 return $self->{'_target_id'};