Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / SeqIO / embldriver.pm
blob5b0903b276fb82f555bbe08ecd24e3139f6e9bfa
2 # BioPerl module for Bio::SeqIO::embldriver
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqIO::embldriver - EMBL sequence input/output stream
18 =head1 SYNOPSIS
20 It is probably best not to use this object directly, but
21 rather go through the SeqIO handler system. Go:
23 $stream = Bio::SeqIO->new(-file => $filename, -format => 'embldriver');
25 while ( (my $seq = $stream->next_seq()) ) {
26 # do something with $seq
29 =head1 DESCRIPTION
31 This object can transform Bio::Seq objects to and from EMBL flat
32 file databases.
34 There is a lot of flexibility here about how to dump things which
35 should be documented more fully.
37 There should be a common object that this and Genbank share (probably
38 with Swissprot). Too much of the magic is identical.
40 =head2 Optional functions
42 =over 3
44 =item _show_dna()
46 (output only) shows the dna or not
48 =item _post_sort()
50 (output only) provides a sorting func which is applied to the FTHelpers
51 before printing
53 =item _id_generation_func()
55 This is function which is called as
57 print "ID ", $func($annseq), "\n";
59 To generate the ID line. If it is not there, it generates a sensible ID
60 line using a number of tools.
62 If you want to output annotations in EMBL format they need to be
63 stored in a Bio::Annotation::Collection object which is accessible
64 through the Bio::SeqI interface method L<annotation()|annotation>.
66 The following are the names of the keys which are polled from a
67 L<Bio::Annotation::Collection> object.
69 reference - Should contain Bio::Annotation::Reference objects
70 comment - Should contain Bio::Annotation::Comment objects
71 dblink - Should contain Bio::Annotation::DBLink objects
73 =back
75 =head1 FEEDBACK
77 =head2 Mailing Lists
79 User feedback is an integral part of the evolution of this and other
80 Bioperl modules. Send your comments and suggestions preferably to one
81 of the Bioperl mailing lists. Your participation is much appreciated.
83 bioperl-l@bioperl.org - General discussion
84 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
86 =head2 Support
88 Please direct usage questions or support issues to the mailing list:
90 I<bioperl-l@bioperl.org>
92 rather than to the module maintainer directly. Many experienced and
93 reponsive experts will be able look at the problem and quickly
94 address it. Please include a thorough description of the problem
95 with code and data examples if at all possible.
97 =head2 Reporting Bugs
99 Report bugs to the Bioperl bug tracking system to help us keep track
100 the bugs and their resolution. Bug reports can be submitted via
101 the web:
103 https://github.com/bioperl/bioperl-live/issues
105 =head1 AUTHOR - Ewan Birney
107 Email birney@ebi.ac.uk
109 =head1 APPENDIX
111 The rest of the documentation details each of the object
112 methods. Internal methods are usually preceded with a _
114 =cut
116 # Let the code begin...
118 package Bio::SeqIO::embldriver;
120 use vars qw(%FTQUAL_NO_QUOTE);
121 use strict;
122 use Bio::SeqIO::Handler::GenericRichSeqHandler;
123 use Data::Dumper;
125 use base qw(Bio::SeqIO);
127 my %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
128 anticodon citation
129 codon codon_start
130 compare cons_splice
131 direction estimated_length
132 evidence label
133 mod_base number
134 rpt_type rpt_unit
135 rpt_unit_range tag_peptide
136 transl_except transl_table
137 usedin
138 LOCATION
141 my %DATA_KEY = (
142 ID => 'ID',
143 AC => 'ACCESSION',
144 DT => 'DATE',
145 DE => 'DESCRIPTION',
146 KW => 'KEYWORDS',
147 OS => 'SOURCE',
148 OC => 'CLASSIFICATION',
149 OG => 'ORGANELLE',
150 RN => 'REFERENCE',
151 RA => 'AUTHORS',
152 RC => 'COMMENT',
153 RG => 'CONSRTM',
154 RP => 'POSITION',
155 RX => 'CROSSREF',
156 RT => 'TITLE',
157 RL => 'LOCATION',
158 XX => 'SPACER',
159 FH => 'FEATHEADER',
160 FT => 'FEATURES',
161 AH => 'TPA_HEADER', # Third party annotation
162 AS => 'TPA_DATA', # Third party annotation
163 DR => 'DBLINK',
164 CC => 'COMMENT',
165 CO => 'CO',
166 CON => 'CON',
167 WGS => 'WGS',
168 ANN => 'ANN',
169 TPA => 'TPA',
170 SQ => 'SEQUENCE',
173 my %SEC = (
174 OC => 'CLASSIFICATION',
175 OH => 'HOST', # not currently handled, bundled with organism data for now
176 OG => 'ORGANELLE',
177 OX => 'CROSSREF',
178 RA => 'AUTHORS',
179 RC => 'COMMENT',
180 RG => 'CONSRTM',
181 RP => 'POSITION',
182 RX => 'CROSSREF',
183 RT => 'TITLE',
184 RL => 'JOURNAL',
185 AS => 'ASSEMBLYINFO', # Third party annotation
188 my %DELIM = (
189 #CC => "\n",
190 #DR => "\n",
191 #DT => "\n",
194 # signals to process what's in the hash prior to next round
195 # these should be changed to map secondary data
196 my %PRIMARY = map {$_ => 1} qw(ID AC DT DE SV KW OS RN AH DR FH CC SQ FT WGS CON ANN TPA //);
198 sub _initialize {
199 my($self,@args) = @_;
201 $self->SUPER::_initialize(@args);
202 my $handler = $self->_rearrange([qw(HANDLER)],@args);
203 # hash for functions for decoding keys.
204 $handler ? $self->seqhandler($handler) :
205 $self->seqhandler(Bio::SeqIO::Handler::GenericRichSeqHandler->new(
206 -format => 'embl',
207 -verbose => $self->verbose,
208 -builder => $self->sequence_builder
211 if( ! defined $self->sequence_factory ) {
212 $self->sequence_factory(Bio::Seq::SeqFactory->new
213 (-verbose => $self->verbose(),
214 -type => 'Bio::Seq::RichSeq'));
218 =head2 next_seq
220 Title : next_seq
221 Usage : $seq = $stream->next_seq()
222 Function: returns the next sequence in the stream
223 Returns : Bio::Seq object
224 Args :
226 =cut
228 sub next_seq {
229 my $self = shift;
230 my $hobj = $self->seqhandler;
231 local($/) = "\n";
232 my ($featkey, $qual, $annkey, $delim, $seqdata);
233 my $lastann = '';
234 my $ct = 0;
235 PARSER:
236 while(defined(my $line = $self->_readline)) {
237 next PARSER if $line =~ m{^\s*$};
238 chomp $line;
239 my ($ann,$data) = split m{\s{2,3}}, $line , 2;
240 next PARSER if ($ann eq 'XX' || $ann eq 'FH');
241 if ($ann) {
242 $data ||='';
243 if ($ann eq 'FT') {
244 # seqfeatures
245 if ($data =~ m{^(\S+)\s+([^\n]+)}) {
246 $hobj->data_handler($seqdata) if $seqdata;
247 $seqdata = ();
248 ($seqdata->{FEATURE_KEY}, $data) = ($1, $2);
249 $seqdata->{NAME} = $ann;
250 $qual = 'LOCATION';
251 } elsif ($data =~ m{^\s+/([^=]+)=?(.+)?}) {
252 ($qual, $data) = ($1, $2 ||'');
253 $ct = (exists $seqdata->{$qual}) ?
254 ((ref($seqdata->{$qual})) ? scalar(@{ $seqdata->{$qual} }) : 1)
255 : 0 ;
257 $data =~ s{^\s+}{};
258 $data =~ tr{"}{}d; # we don't care about quotes yet...
259 my $delim = ($FTQUAL_NO_QUOTE{$qual}) ? '' : ' ';
260 if ($ct == 0) {
261 $seqdata->{$qual} .= ($seqdata->{$qual}) ?
262 $delim.$data :
263 $data;
264 } else {
265 if (!ref($seqdata->{$qual})) {
266 $seqdata->{$qual} = [$seqdata->{$qual}];
268 (exists $seqdata->{$qual}->[$ct]) ?
269 (($seqdata->{$qual}->[$ct]) .= $delim.$data) :
270 (($seqdata->{$qual}->[$ct]) .= $data);
272 } else {
273 # simple annotations
274 $data =~ s{;$}{};
275 last PARSER if $ann eq '//';
276 if ($ann ne $lastann) {
277 if (!$SEC{$ann} && $seqdata) {
278 $hobj->data_handler($seqdata);
279 # can't use undef here; it can lead to subtle mem leaks
280 $seqdata = ();
282 $annkey = (!$SEC{$ann}) ? 'DATA' : # primary data
283 $SEC{$ann};
284 $seqdata->{'NAME'} = $ann if !$SEC{$ann};
287 # toss the data for SQ lines; this needs to be done after the
288 # call to the data handler
290 next PARSER if $ann eq 'SQ';
291 my $delim = $DELIM{$ann} || ' ';
292 $seqdata->{$annkey} .= ($seqdata->{$annkey}) ?
293 $delim.$data : $data;
294 $lastann = $ann;
296 } else {
297 # this should only be sequence (fingers crossed!)
298 SEQUENCE:
299 while (defined ($line = $self->_readline)) {
300 if (index($line, '//') == 0) {
301 $data =~ tr{0-9 \n}{}d;
302 $seqdata->{DATA} = $data;
303 #$self->debug(Dumper($seqdata));
304 $hobj->data_handler($seqdata);
305 $seqdata = ();
306 last PARSER;
307 } else {
308 $data .= $line;
309 $line = undef;
314 $hobj->data_handler($seqdata) if $seqdata;
315 $seqdata = ();
316 return $hobj->build_sequence;
319 sub next_chunk {
320 my $self = shift;
321 my $ct = 0;
322 PARSER:
323 while(defined(my $line = $self->_readline)) {
324 next if $line =~ m{^\s*$};
325 chomp $line;
326 my ($ann,$data) = split m{\s{2,3}}, $line , 2;
327 $data ||= '';
328 $self->debug("Ann: [$ann]\n\tData: [$data]\n");
329 last PARSER if $ann =~ m{//};
333 =head2 write_seq
335 Title : write_seq
336 Usage : $stream->write_seq($seq)
337 Function: writes the $seq object (must be seq) to the stream
338 Returns : 1 for success and 0 for error
339 Args : array of 1 to n Bio::SeqI objects
341 =cut
343 sub write_seq {
344 shift->throw("Use Bio::SeqIO::embl for output");
345 # maybe make a Writer class as well????
348 =head2 seqhandler
350 Title : seqhandler
351 Usage : $stream->seqhandler($handler)
352 Function: Get/Set the Bio::Seq::HandlerBaseI object
353 Returns : Bio::Seq::HandlerBaseI
354 Args : Bio::Seq::HandlerBaseI
356 =cut
358 sub seqhandler {
359 my ($self, $handler) = @_;
360 if ($handler) {
361 $self->throw("Not a Bio::HandlerBaseI") unless
362 ref($handler) && $handler->isa("Bio::HandlerBaseI");
363 $self->{'_seqhandler'} = $handler;
365 return $self->{'_seqhandler'};
370 __END__