t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / SeqIO / kegg.pm
blob1753295619c05793a4d6a7242d9397ebf755f0fb
2 # BioPerl module for Bio::SeqIO::kegg
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Allen Day <allenday@ucla.edu>
8 # Copyright Allen Day
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqIO::kegg - KEGG sequence input/output stream
18 =head1 SYNOPSIS
20 # It is probably best not to use this object directly, but
21 # rather go through the SeqIO handler system. Go:
23 use Bio::SeqIO;
25 $stream = Bio::SeqIO->new(-file => $filename, -format => 'KEGG');
27 while ( my $seq = $stream->next_seq() ) {
28 # do something with $seq
31 =head1 DESCRIPTION
33 This class transforms KEGG gene records into Bio::Seq objects.
35 =head2 Mapping of record properties to object properties
37 This section is supposed to document which sections and properties of
38 a KEGG databank record end up where in the Bioperl object model. It
39 is far from complete and presently focuses only on those mappings
40 which may be non-obvious. $seq in the text refers to the
41 Bio::Seq::RichSeqI implementing object returned by the parser for each
42 record.
44 =over 4
46 =item 'ENTRY'
48 $seq->primary_id
50 =item 'NAME'
52 $seq->display_id
54 =item 'DEFINITION'
56 $seq->annotation->get_Annotations('description');
58 =item 'ORTHOLOG'
60 grep {$_->database eq 'KO'} $seq->annotation->get_Annotations('dblink')
62 =item 'CLASS'
64 grep {$_->database eq 'PATH'}
65 $seq->annotation->get_Annotations('dblink')
67 =item 'POSITION'
69 FIXME, NOT IMPLEMENTED
71 =item 'PATHWAY'
73 for my $pathway ( $seq->annotation->get_Annotations('pathway') ) {
77 =item 'DBLINKS'
79 $seq->annotation->get_Annotations('dblink')
81 =item 'CODON_USAGE'
83 FIXME, NOT IMPLEMENTED
85 =item 'AASEQ'
87 $seq->translate->seq
89 =item 'NTSEQ'
91 $seq-E<gt>seq
93 =back
95 =head1 FEEDBACK
97 =head2 Mailing Lists
99 User feedback is an integral part of the evolution of this and other
100 Bioperl modules. Send your comments and suggestions preferably to one
101 of the Bioperl mailing lists. Your participation is much appreciated.
103 bioperl-l@bioperl.org - General discussion
104 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
106 =head2 Support
108 Please direct usage questions or support issues to the mailing list:
110 I<bioperl-l@bioperl.org>
112 rather than to the module maintainer directly. Many experienced and
113 reponsive experts will be able look at the problem and quickly
114 address it. Please include a thorough description of the problem
115 with code and data examples if at all possible.
117 =head2 Reporting Bugs
119 Report bugs to the Bioperl bug tracking system to help us keep track
120 the bugs and their resolution. Bug reports can be submitted via the web:
122 https://github.com/bioperl/bioperl-live/issues
124 =head1 AUTHOR - Allen Day
126 Email allenday@ucla.edu
128 =head1 APPENDIX
130 The rest of the documentation details each of the object
131 methods. Internal methods are usually preceded with a _
133 =cut
135 # Let the code begin...
137 package Bio::SeqIO::kegg;
138 use strict;
140 use Bio::SeqFeature::Generic;
141 use Bio::Species;
142 use Bio::Seq::SeqFactory;
143 use Bio::Annotation::Collection;
144 use Bio::Annotation::Comment;
145 use Bio::Annotation::DBLink;
147 use base qw(Bio::SeqIO);
149 sub _initialize {
150 my($self,@args) = @_;
152 $self->SUPER::_initialize(@args);
153 # hash for functions for decoding keys.
154 $self->{'_func_ftunit_hash'} = {};
155 if( ! defined $self->sequence_factory ) {
156 $self->sequence_factory(Bio::Seq::SeqFactory->new
157 (-verbose => $self->verbose(),
158 -type => 'Bio::Seq::RichSeq'));
162 =head2 next_seq
164 Title : next_seq
165 Usage : $seq = $stream->next_seq()
166 Function: returns the next sequence in the stream
167 Returns : Bio::Seq::RichSeq object
168 Args :
170 =cut
172 sub next_seq {
173 my ($self,@args) = @_;
174 my $builder = $self->sequence_builder();
175 my $seq;
176 my %params;
178 my $buffer;
179 my (@acc, @features);
180 my ($display_id, $annotation);
181 my $species;
183 # initialize; we may come here because of starting over
184 @features = ();
185 $annotation = undef;
186 @acc = ();
187 $species = undef;
188 %params = (-verbose => $self->verbose); # reset hash
189 local($/) = "///\n";
191 $buffer = $self->_readline();
193 return if( !defined $buffer ); # end of file
194 $buffer =~ /^ENTRY/ ||
195 $self->throw("KEGG stream with bad ENTRY line. Not KEGG in my book. Got $buffer'");
197 my %FIELDS;
198 my @chunks = split /\n(?=\S)/, $buffer;
200 foreach my $chunk (@chunks){
201 my($key) = $chunk =~ /^(\S+)/;
202 $FIELDS{$key} = $chunk;
205 # changing to split method to get entry_ids that include
206 # sequence version like Whatever.1
207 my(undef,$entry_id,$entry_seqtype,$entry_species) =
208 split(' ',$FIELDS{ENTRY});
210 my($name);
211 if ($FIELDS{NAME}) {
212 ($name) = $FIELDS{NAME} =~ /^NAME\s+(.+)$/;
215 my( $definition, $aa_length, $aa_seq, $nt_length, $nt_seq );
217 if(( exists $FIELDS{DEFINITION} ) and ( $FIELDS{DEFINITION} =~ /^DEFINITION/ )) {
218 ($definition) = $FIELDS{DEFINITION} =~ /^DEFINITION\s+(.+)$/s;
219 $definition =~ s/\s+/ /gs;
221 if(( exists $FIELDS{AASEQ} ) and ( $FIELDS{AASEQ} =~ /^AASEQ/ )) {
222 ($aa_length,$aa_seq) = $FIELDS{AASEQ} =~ /^AASEQ\s+(\d+)\n(.+)$/s;
223 $aa_seq =~ s/\s+//g;
225 if(( exists $FIELDS{NTSEQ} ) and ( $FIELDS{NTSEQ} =~ /^NTSEQ/ )) {
226 ($nt_length,$nt_seq) = $FIELDS{NTSEQ} =~ /^NTSEQ\s+(\d+)\n(.+)$/s;
227 $nt_seq =~ s/\s+//g;
230 $annotation = Bio::Annotation::Collection->new();
232 $annotation->add_Annotation('description',
233 Bio::Annotation::Comment->new(-text => $definition));
235 $annotation->add_Annotation('aa_seq',
236 Bio::Annotation::Comment->new(-text => $aa_seq));
238 my($ortholog_db,$ortholog_id,$ortholog_desc);
239 if ($FIELDS{ORTHOLOG}) {
240 ($ortholog_db,$ortholog_id,$ortholog_desc) = $FIELDS{ORTHOLOG}
241 =~ /^ORTHOLOG\s+(\S+):\s+(\S+)\s+(.*?)$/;
243 $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(
244 -database => $ortholog_db,
245 -primary_id => $ortholog_id,
246 -comment => $ortholog_desc) );
249 if($FIELDS{MOTIF}){
250 $FIELDS{MOTIF} =~ s/^MOTIF\s+//;
251 while($FIELDS{MOTIF} =~/\s*?(\S+):\s+(.+?)$/mg){
252 my $db = $1;
253 my $ids = $2;
254 foreach my $id (split(/\s+/, $ids)){
256 $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(
257 -database =>$db,
258 -primary_id => $id,
259 -comment => "") );
264 if($FIELDS{PATHWAY}) {
265 $FIELDS{PATHWAY} =~ s/^PATHWAY\s+//;
266 while($FIELDS{PATHWAY} =~ /\s*PATH:\s+(.+)$/mg){
267 $annotation->add_Annotation('pathway',
268 Bio::Annotation::Comment->new(-text => "$1"));
272 if($FIELDS{POSITION}) {
273 $FIELDS{POSITION} =~ s/^POSITION\s+//;
274 $annotation->add_Annotation('position',
275 Bio::Annotation::Comment->new(-text => $FIELDS{POSITION}));
278 if ($FIELDS{CLASS}) {
279 $FIELDS{CLASS} =~ s/^CLASS\s+//;
280 $FIELDS{'CLASS'} =~ s/\n//g;
281 while($FIELDS{CLASS} =~ /(.*?)\[(\S+):(\S+)\]/g){
282 my ($pathway,$db,$id) = ($1,$2,$3);
283 $pathway =~ s/\s+/ /g;
284 $pathway =~ s/\s$//g;
285 $pathway =~ s/^\s+//;
286 $annotation->add_Annotation('pathway',
287 Bio::Annotation::Comment->new(-text => $pathway));
289 $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(
290 -database => $db, -primary_id => $id));
294 if($FIELDS{DBLINKS}) {
295 $FIELDS{DBLINKS} =~ s/^DBLINKS/ /;
296 while($FIELDS{DBLINKS} =~ /\s+(\S+):\s+(\S+)\n?/gs){ ### modified
297 $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(
298 -database => $1, -primary_id => $2)) if $1;
302 $params{'-alphabet'} = 'dna';
303 $params{'-seq'} = $nt_seq;
304 $params{'-display_id'} = $name;
305 $params{'-accession_number'} = $entry_id;
306 $params{'-species'} = Bio::Species->new(
307 -common_name => $entry_species);
308 $params{'-annotation'} = $annotation;
310 $builder->add_slot_value(%params);
311 $seq = $builder->make_object();
313 return $seq;
316 =head2 write_seq
318 Title : write_seq
319 Note : write_seq() is not implemented for KEGG format output.
321 =cut
323 sub write_seq {
324 shift->throw("write_seq() not implemented for KEGG format output.");