Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / SeqIO / swissdriver.pm
blob408e2e634cfdc74d231424c37044ec830db98c20
2 # BioPerl module for Bio::SeqIO::swissdriver
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Bioperl project bioperl-l(at)bioperl.org
8 # Copyright Chris Fields and contributors see AUTHORS section
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqIO::swissdriver - SwissProt/UniProt handler-based push parser
18 =head1 SYNOPSIS
20 #It is probably best not to use this object directly, but
21 #rather go through the SeqIO handler:
23 $stream = Bio::SeqIO->new(-file => $filename,
24 -format => 'swissdriver');
26 while ( my $seq = $stream->next_seq() ) {
27 # do something with $seq
30 =head1 DESCRIPTION
32 This object can transform Bio::Seq objects to and from UniProt flat file
33 databases. The key difference between this parser and the tried-and-true
34 Bio::SeqIO::swiss parser is this version separates the parsing and data
35 manipulation into a 'driver' method (next_seq) and separate object handlers
36 which deal with the data passed to it.
38 =head2 The Driver
40 The main purpose of the driver routine, in this case next_seq(), is to carve out
41 the data into meaningful chunks which are passed along to relevant handlers (see
42 below).
44 Each chunk of data in the has a NAME tag attached to it, similar to that for XML
45 parsing. This designates the type of data passed (annotation type or seqfeature)
46 and the handler to be called for processing the data.
48 =head1 FEEDBACK
50 =head2 Mailing Lists
52 User feedback is an integral part of the evolution of this and other
53 Bioperl modules. Send your comments and suggestions preferably to one
54 of the Bioperl mailing lists. Your participation is much appreciated.
56 bioperl-l@bioperl.org - General discussion
57 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
59 =head2 Support
61 Please direct usage questions or support issues to the mailing list:
63 I<bioperl-l@bioperl.org>
65 rather than to the module maintainer directly. Many experienced and
66 reponsive experts will be able look at the problem and quickly
67 address it. Please include a thorough description of the problem
68 with code and data examples if at all possible.
70 =head2 Reporting Bugs
72 Report bugs to the Bioperl bug tracking system to help us keep track
73 the bugs and their resolution. Bug reports can be submitted via the web:
75 https://github.com/bioperl/bioperl-live/issues
77 =head1 AUTHOR - Bioperl Project
79 bioperl-l at bioperl.org
81 =head1 APPENDIX
83 The rest of the documentation details each of the object
84 methods. Internal methods are usually preceded with a _
86 =cut
88 # POD is at the end of the module
90 # Let the code begin...
92 # Let the code begin...
94 package Bio::SeqIO::swissdriver;
95 use vars qw(%FTQUAL_NO_QUOTE);
96 use strict;
97 use Bio::SeqIO::Handler::GenericRichSeqHandler;
98 use Data::Dumper;
100 use base qw(Bio::SeqIO);
102 # signals to process what's in the hash prior to next round, maps ann => names
103 my %SEC = (
104 OC => 'CLASSIFICATION',
105 OH => 'HOST', # not currently handled, bundled with organism data for now
106 OG => 'ORGANELLE',
107 OX => 'CROSSREF',
108 RA => 'AUTHORS',
109 RC => 'COMMENT',
110 RG => 'CONSRTM',
111 RP => 'POSITION',
112 RX => 'CROSSREF',
113 RT => 'TITLE',
114 RL => 'JOURNAL',
115 AS => 'ASSEMBLYINFO', # Third party annotation
116 '//' => 'RECORDEND'
119 # add specialized delimiters here for easier postprocessing
120 my %DELIM = (
121 CC => "\n",
122 DR => "\n",
123 DT => "\n",
126 sub _initialize {
127 my($self,@args) = @_;
129 $self->SUPER::_initialize(@args);
130 my $handler = $self->_rearrange([qw(HANDLER)],@args);
131 # hash for functions for decoding keys.
132 $handler ? $self->seqhandler($handler) :
133 $self->seqhandler(Bio::SeqIO::Handler::GenericRichSeqHandler->new(
134 -format => 'swiss',
135 -verbose => $self->verbose,
136 -builder => $self->sequence_builder
138 if( ! defined $self->sequence_factory ) {
139 $self->sequence_factory(Bio::Seq::SeqFactory->new
140 (-verbose => $self->verbose(),
141 -type => 'Bio::Seq::RichSeq'));
145 =head2 next_seq
147 Title : next_seq
148 Usage : $seq = $stream->next_seq()
149 Function: returns the next sequence in the stream
150 Returns : Bio::Seq object
151 Args : none
153 =cut
155 sub next_seq {
156 my $self = shift;
157 my $hobj = $self->seqhandler;
158 local($/) = "\n";
159 # these contain values that need to carry over each round
160 my ($featkey, $qual, $annkey, $seqdata, $location);
161 my $lastann = '';
162 my $ct = 0;
163 # main parser
164 PARSER:
165 while(defined(my $line = $self->_readline)) {
166 chomp $line;
167 my ($ann, $data) = split(m{\s+}, $line, 2);
168 if ($ann) {
169 if ($ann eq 'FT') {
170 # sequence features
171 if ($data =~ m{^(\w+)\s+([\d\?\<]+)\s+([\d\?\>]+)(?:\s+?(\S.*))?}ox) {
172 # has location data and desc
173 if ($seqdata) {
174 $hobj->data_handler($seqdata);
175 $seqdata = ();
177 ($seqdata->{FEATURE_KEY}, my $loc1, my $loc2, $data) = ($1, $2, $3, $4);
178 $qual = 'description';
179 $seqdata->{$qual} = $data;
180 $seqdata->{NAME} = $ann;
181 $seqdata->{LOCATION} = "$loc1..$loc2" if defined $loc1;
182 next PARSER;
183 } elsif ($data =~ m{^\s+/([^=]+)(?:=(.+))?}ox) {
184 # has qualifer
185 ($qual, $data) = ($1, $2 || '');
186 $ct = ($seqdata->{$qual}) ?
187 ((ref($seqdata->{$qual})) ? scalar(@{ $seqdata->{$qual} }) : 1)
188 : 0 ;
190 $data =~ s{\.$}{};
191 if ($ct == 0) {
192 $seqdata->{$qual} .= ($seqdata->{$qual}) ?
193 ' '.$data : $data;
194 } else {
195 if (!ref($seqdata->{$qual})) {
196 $seqdata->{$qual} = [$seqdata->{$qual}];
198 ($seqdata->{$qual}->[$ct]) ?
199 ($seqdata->{$qual}->[$ct] .= ' '.$data) :
200 ($seqdata->{$qual}->[$ct] .= $data);
202 } else {
203 # simple annotations
204 if ($ann ne $lastann) {
205 if (!$SEC{$ann} && $seqdata) {
206 $hobj->data_handler($seqdata);
207 # can't use undef here; it can lead to subtle mem leaks
208 $seqdata = ();
210 $annkey = (!$SEC{$ann}) ? 'DATA' : # primary data
211 $SEC{$ann};
212 $seqdata->{'NAME'} = $ann if !$SEC{$ann};
214 last PARSER if $ann eq '//';
215 next PARSER if $ann eq 'SQ';
216 my $delim = $DELIM{$ann} || ' ';
217 $seqdata->{$annkey} .= ($seqdata->{$annkey}) ?
218 $delim.$data : $data;
219 $lastann = $ann;
221 } else {
222 # this should only be sequence (fingers crossed!)
223 SEQUENCE:
224 while (defined ($line = $self->_readline)) {
225 if (index($line, '//') == 0) {
226 $data =~ tr{0-9 \n}{}d;
227 $seqdata->{DATA} = $data;
228 #$self->debug(Dumper($seqdata));
229 $hobj->data_handler($seqdata);
230 $seqdata = ();
231 last PARSER;
232 } else {
233 $data .= $line;
234 $line = undef;
239 # some files have no // for the last file; this catches the last bit o' data
240 $hobj->data_handler($seqdata) if $seqdata;
241 return $hobj->build_sequence;
244 =head2 write_seq
246 Title : write_seq
247 Usage : $stream->write_seq($seq)
248 Function: writes the $seq object (must be seq) to the stream
249 Returns : 1 for success and 0 for error
250 Args : array of 1 to n Bio::SeqI objects
252 =cut
254 sub write_seq {
255 shift->throw("Use Bio::SeqIO::swiss write_seq() for output");
256 # maybe make a Writer class as well????
259 =head2 seqhandler
261 Title : seqhandler
262 Usage : $stream->seqhandler($handler)
263 Function: Get/Set the Bio::Seq::HandlerBaseI object
264 Returns : Bio::Seq::HandlerBaseI
265 Args : Bio::Seq::HandlerBaseI
267 =cut
269 sub seqhandler {
270 my ($self, $handler) = @_;
271 if ($handler) {
272 $self->throw("Not a Bio::HandlerBaseI") unless
273 ref($handler) && $handler->isa("Bio::HandlerBaseI");
274 $self->{'_seqhandler'} = $handler;
276 return $self->{'_seqhandler'};
281 __END__