Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / SeqIO / embldriver.pm
blob1712ae4c247098a24a3da95272893cf558c65b46
2 # BioPerl module for Bio::SeqIO::embldriver
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqIO::embldriver - EMBL sequence input/output stream
18 =head1 SYNOPSIS
20 It is probably best not to use this object directly, but
21 rather go through the SeqIO handler system. Go:
23 $stream = Bio::SeqIO->new(-file => $filename, -format => 'embldriver');
25 while ( (my $seq = $stream->next_seq()) ) {
26 # do something with $seq
29 =head1 DESCRIPTION
31 This object can transform Bio::Seq objects to and from EMBL flat
32 file databases.
34 There is a lot of flexibility here about how to dump things which
35 should be documented more fully.
37 There should be a common object that this and Genbank share (probably
38 with Swissprot). Too much of the magic is identical.
40 =head2 Optional functions
42 =over 3
44 =item _show_dna()
46 (output only) shows the dna or not
48 =item _post_sort()
50 (output only) provides a sorting func which is applied to the FTHelpers
51 before printing
53 =item _id_generation_func()
55 This is function which is called as
57 print "ID ", $func($annseq), "\n";
59 To generate the ID line. If it is not there, it generates a sensible ID
60 line using a number of tools.
62 If you want to output annotations in EMBL format they need to be
63 stored in a Bio::Annotation::Collection object which is accessible
64 through the Bio::SeqI interface method L<annotation()|annotation>.
66 The following are the names of the keys which are polled from a
67 L<Bio::Annotation::Collection> object.
69 reference - Should contain Bio::Annotation::Reference objects
70 comment - Should contain Bio::Annotation::Comment objects
71 dblink - Should contain Bio::Annotation::DBLink objects
73 =back
75 =head1 FEEDBACK
77 =head2 Mailing Lists
79 User feedback is an integral part of the evolution of this and other
80 Bioperl modules. Send your comments and suggestions preferably to one
81 of the Bioperl mailing lists. Your participation is much appreciated.
83 bioperl-l@bioperl.org - General discussion
84 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
86 =head2 Support
88 Please direct usage questions or support issues to the mailing list:
90 I<bioperl-l@bioperl.org>
92 rather than to the module maintainer directly. Many experienced and
93 reponsive experts will be able look at the problem and quickly
94 address it. Please include a thorough description of the problem
95 with code and data examples if at all possible.
97 =head2 Reporting Bugs
99 Report bugs to the Bioperl bug tracking system to help us keep track
100 the bugs and their resolution. Bug reports can be submitted via
101 the web:
103 https://github.com/bioperl/bioperl-live/issues
105 =head1 AUTHOR - Ewan Birney
107 Email birney@ebi.ac.uk
109 =head1 APPENDIX
111 The rest of the documentation details each of the object
112 methods. Internal methods are usually preceded with a _
114 =cut
116 # Let the code begin...
118 package Bio::SeqIO::embldriver;
119 use vars qw(%FTQUAL_NO_QUOTE);
120 use strict;
121 use Bio::SeqIO::Handler::GenericRichSeqHandler;
122 use Data::Dumper;
124 use base qw(Bio::SeqIO);
126 my %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
127 anticodon citation
128 codon codon_start
129 cons_splice direction
130 evidence label
131 mod_base number
132 rpt_type rpt_unit
133 transl_except transl_table
134 usedin
135 LOCATION
138 my %DATA_KEY = (
139 ID => 'ID',
140 AC => 'ACCESSION',
141 DT => 'DATE',
142 DE => 'DESCRIPTION',
143 KW => 'KEYWORDS',
144 OS => 'SOURCE',
145 OC => 'CLASSIFICATION',
146 OG => 'ORGANELLE',
147 RN => 'REFERENCE',
148 RA => 'AUTHORS',
149 RC => 'COMMENT',
150 RG => 'CONSRTM',
151 RP => 'POSITION',
152 RX => 'CROSSREF',
153 RT => 'TITLE',
154 RL => 'LOCATION',
155 XX => 'SPACER',
156 FH => 'FEATHEADER',
157 FT => 'FEATURES',
158 AH => 'TPA_HEADER', # Third party annotation
159 AS => 'TPA_DATA', # Third party annotation
160 DR => 'DBLINK',
161 CC => 'COMMENT',
162 CO => 'CO',
163 CON => 'CON',
164 WGS => 'WGS',
165 ANN => 'ANN',
166 TPA => 'TPA',
167 SQ => 'SEQUENCE',
170 my %SEC = (
171 OC => 'CLASSIFICATION',
172 OH => 'HOST', # not currently handled, bundled with organism data for now
173 OG => 'ORGANELLE',
174 OX => 'CROSSREF',
175 RA => 'AUTHORS',
176 RC => 'COMMENT',
177 RG => 'CONSRTM',
178 RP => 'POSITION',
179 RX => 'CROSSREF',
180 RT => 'TITLE',
181 RL => 'JOURNAL',
182 AS => 'ASSEMBLYINFO', # Third party annotation
185 my %DELIM = (
186 #CC => "\n",
187 #DR => "\n",
188 #DT => "\n",
191 # signals to process what's in the hash prior to next round
192 # these should be changed to map secondary data
193 my %PRIMARY = map {$_ => 1} qw(ID AC DT DE SV KW OS RN AH DR FH CC SQ FT WGS CON ANN TPA //);
195 sub _initialize {
196 my($self,@args) = @_;
198 $self->SUPER::_initialize(@args);
199 my $handler = $self->_rearrange([qw(HANDLER)],@args);
200 # hash for functions for decoding keys.
201 $handler ? $self->seqhandler($handler) :
202 $self->seqhandler(Bio::SeqIO::Handler::GenericRichSeqHandler->new(
203 -format => 'embl',
204 -verbose => $self->verbose,
205 -builder => $self->sequence_builder
208 if( ! defined $self->sequence_factory ) {
209 $self->sequence_factory(Bio::Seq::SeqFactory->new
210 (-verbose => $self->verbose(),
211 -type => 'Bio::Seq::RichSeq'));
215 =head2 next_seq
217 Title : next_seq
218 Usage : $seq = $stream->next_seq()
219 Function: returns the next sequence in the stream
220 Returns : Bio::Seq object
221 Args :
223 =cut
225 sub next_seq {
226 my $self = shift;
227 my $hobj = $self->seqhandler;
228 local($/) = "\n";
229 my ($featkey, $qual, $annkey, $delim, $seqdata);
230 my $lastann = '';
231 my $ct = 0;
232 PARSER:
233 while(defined(my $line = $self->_readline)) {
234 next PARSER if $line =~ m{^\s*$};
235 chomp $line;
236 my ($ann,$data) = split m{\s{2,3}}, $line , 2;
237 next PARSER if ($ann eq 'XX' || $ann eq 'FH');
238 if ($ann) {
239 $data ||='';
240 if ($ann eq 'FT') {
241 # seqfeatures
242 if ($data =~ m{^(\S+)\s+([^\n]+)}) {
243 $hobj->data_handler($seqdata) if $seqdata;
244 $seqdata = ();
245 ($seqdata->{FEATURE_KEY}, $data) = ($1, $2);
246 $seqdata->{NAME} = $ann;
247 $qual = 'LOCATION';
248 } elsif ($data =~ m{^\s+/([^=]+)=?(.+)?}) {
249 ($qual, $data) = ($1, $2 ||'');
250 $ct = (exists $seqdata->{$qual}) ?
251 ((ref($seqdata->{$qual})) ? scalar(@{ $seqdata->{$qual} }) : 1)
252 : 0 ;
254 $data =~ s{^\s+}{};
255 $data =~ tr{"}{}d; # we don't care about quotes yet...
256 my $delim = ($FTQUAL_NO_QUOTE{$qual}) ? '' : ' ';
257 if ($ct == 0) {
258 $seqdata->{$qual} .= ($seqdata->{$qual}) ?
259 $delim.$data :
260 $data;
261 } else {
262 if (!ref($seqdata->{$qual})) {
263 $seqdata->{$qual} = [$seqdata->{$qual}];
265 (exists $seqdata->{$qual}->[$ct]) ?
266 (($seqdata->{$qual}->[$ct]) .= $delim.$data) :
267 (($seqdata->{$qual}->[$ct]) .= $data);
269 } else {
270 # simple annotations
271 $data =~ s{;$}{};
272 last PARSER if $ann eq '//';
273 if ($ann ne $lastann) {
274 if (!$SEC{$ann} && $seqdata) {
275 $hobj->data_handler($seqdata);
276 # can't use undef here; it can lead to subtle mem leaks
277 $seqdata = ();
279 $annkey = (!$SEC{$ann}) ? 'DATA' : # primary data
280 $SEC{$ann};
281 $seqdata->{'NAME'} = $ann if !$SEC{$ann};
284 # toss the data for SQ lines; this needs to be done after the
285 # call to the data handler
287 next PARSER if $ann eq 'SQ';
288 my $delim = $DELIM{$ann} || ' ';
289 $seqdata->{$annkey} .= ($seqdata->{$annkey}) ?
290 $delim.$data : $data;
291 $lastann = $ann;
293 } else {
294 # this should only be sequence (fingers crossed!)
295 SEQUENCE:
296 while (defined ($line = $self->_readline)) {
297 if (index($line, '//') == 0) {
298 $data =~ tr{0-9 \n}{}d;
299 $seqdata->{DATA} = $data;
300 #$self->debug(Dumper($seqdata));
301 $hobj->data_handler($seqdata);
302 $seqdata = ();
303 last PARSER;
304 } else {
305 $data .= $line;
306 $line = undef;
311 $hobj->data_handler($seqdata) if $seqdata;
312 $seqdata = ();
313 return $hobj->build_sequence;
316 sub next_chunk {
317 my $self = shift;
318 my $ct = 0;
319 PARSER:
320 while(defined(my $line = $self->_readline)) {
321 next if $line =~ m{^\s*$};
322 chomp $line;
323 my ($ann,$data) = split m{\s{2,3}}, $line , 2;
324 $data ||= '';
325 $self->debug("Ann: [$ann]\n\tData: [$data]\n");
326 last PARSER if $ann =~ m{//};
330 =head2 write_seq
332 Title : write_seq
333 Usage : $stream->write_seq($seq)
334 Function: writes the $seq object (must be seq) to the stream
335 Returns : 1 for success and 0 for error
336 Args : array of 1 to n Bio::SeqI objects
338 =cut
340 sub write_seq {
341 shift->throw("Use Bio::SeqIO::embl for output");
342 # maybe make a Writer class as well????
345 =head2 seqhandler
347 Title : seqhandler
348 Usage : $stream->seqhandler($handler)
349 Function: Get/Set the Bio::Seq::HandlerBaseI object
350 Returns : Bio::Seq::HandlerBaseI
351 Args : Bio::Seq::HandlerBaseI
353 =cut
355 sub seqhandler {
356 my ($self, $handler) = @_;
357 if ($handler) {
358 $self->throw("Not a Bio::HandlerBaseI") unless
359 ref($handler) && $handler->isa("Bio::HandlerBaseI");
360 $self->{'_seqhandler'} = $handler;
362 return $self->{'_seqhandler'};
367 __END__