Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / DB / BioFetch.pm
blobc36130f4b7e177a111e2ba616f1222a393238888
2 # BioPerl module for Bio::DB::BioFetch
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Lincoln Stein <lstein@cshl.org>
8 # Copyright Lincoln Stein
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
15 package Bio::DB::BioFetch;
16 use strict;
17 use HTTP::Request::Common 'POST';
19 =head1 NAME
21 Bio::DB::BioFetch - Database object interface to BioFetch retrieval
23 =head1 SYNOPSIS
25 use Bio::DB::BioFetch;
27 $bf = Bio::DB::BioFetch->new();
29 $seq = $bf->get_Seq_by_id('HSFOS'); # EMBL or SWALL ID
31 # change formats, storage procedures
32 $bf = Bio::DB::BioFetch->new(-format => 'fasta',
33 -retrievaltype => 'tempfile',
34 -db => 'EMBL');
36 $stream = $bf->get_Stream_by_id(['HSFOS','J00231']);
37 while (my $s = $stream->next_seq) {
38 print $s->seq,"\n";
40 # get a RefSeq entry
41 $bf->db('refseq');
42 eval {
43 $seq = $bf->get_Seq_by_version('NM_006732.1'); # RefSeq VERSION
45 print "accession is ", $seq->accession_number, "\n" unless $@;
48 =head1 DESCRIPTION
50 Bio::DB::BioFetch is a guaranteed best effort sequence entry fetching
51 method. It goes to the Web-based dbfetch server located at the EBI
52 (http://www.ebi.ac.uk/Tools/dbfetch/dbfetch) to retrieve sequences in the
53 EMBL or GenBank sequence repositories.
55 This module implements all the Bio::DB::RandomAccessI interface, plus
56 the get_Stream_by_id() and get_Stream_by_acc() methods that are found
57 in the Bio::DB::SwissProt interface.
59 =head1 FEEDBACK
61 =head2 Mailing Lists
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to one
65 of the Bioperl mailing lists. Your participation is much appreciated.
68 bioperl-l@bioperl.org - General discussion
69 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
71 =head2 Support
73 Please direct usage questions or support issues to the mailing list:
75 I<bioperl-l@bioperl.org>
77 rather than to the module maintainer directly. Many experienced and
78 reponsive experts will be able look at the problem and quickly
79 address it. Please include a thorough description of the problem
80 with code and data examples if at all possible.
82 =head2 Reporting Bugs
84 Report bugs to the Bioperl bug tracking system to help us keep track
85 the bugs and their resolution. Bug reports can be submitted via the
86 web:
88 https://github.com/bioperl/bioperl-live/issues
90 =head1 AUTHOR - Lincoln Stein
92 Email Lincoln Stein E<lt>lstein@cshl.orgE<lt>
94 Also thanks to Heikki Lehvaslaiho E<lt>heikki-at-bioperl-dot-orgE<gt> for the
95 BioFetch server and interface specification.
97 =head1 APPENDIX
99 The rest of the documentation details each of the object
100 methods. Internal methods are usually preceded with a _
102 =cut
104 # Let the code begin...
105 use vars qw(%FORMATMAP);
106 use base qw(Bio::DB::WebDBSeqI Bio::Root::Root);
108 # warning: names used here must map into Bio::SeqIO::* space
109 use constant DEFAULT_LOCATION => 'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch';
111 BEGIN {
113 %FORMATMAP = (
114 'embl' => {
115 default => 'embl', # default BioFetch format/SeqIOmodule pair
116 embl => 'embl', # alternative BioFetch format/module pair
117 fasta => 'fasta', # alternative BioFetch format/module pair
118 namespace => 'embl',
120 'swissprot' => {
121 default => 'swiss',
122 swissprot => 'swiss',
123 fasta => 'fasta',
124 namespace => 'uniprot',
126 'refseq' => {
127 default => 'genbank',
128 genbank => 'genbank',
129 fasta => 'fasta',
130 namespace => 'RefSeq',
132 'swall' => {
133 default => 'swiss',
134 swissprot => 'swiss',
135 fasta => 'fasta',
136 namespace => 'uniprot',
138 'uniprot' => {
139 default => 'swiss',
140 swissprot => 'swiss',
141 fasta => 'fasta',
142 namespace => 'uniprot',
144 'genbank' => {
145 default => 'genbank',
146 genbank => 'genbank',
147 namespace => 'genbank',
149 'genpep' => {
150 default => 'genbank',
151 genbank => 'genbank',
152 namespace => 'genpep',
154 'unisave' => {
155 default => 'swiss',
156 swissprot => 'swiss',
157 fasta => 'fasta',
158 namespace => 'unisave',
163 =head2 new
165 Title : new
166 Usage : $bf = Bio::DB::BioFetch->new(@args)
167 Function: Construct a new Bio::DB::BioFetch object
168 Returns : a Bio::DB::BioFetch object
169 Args : see below
170 Throws :
172 @args are standard -name=E<gt>value options as listed in the following
173 table. If you do not provide any options, the module assumes reasonable
174 defaults.
176 Option Value Default
177 ------ ----- -------
179 -baseaddress location of dbfetch server http://www.ebi.ac.uk/Tools/dbfetch/dbfetch
180 -retrievaltype "tempfile" or "io_string" io_string
181 -format "embl", "fasta", "swissprot", embl
182 or "genbank"
183 -db "embl", "genbank" or "swissprot" embl
185 =cut
188 sub new {
189 my ($class,@args) = @_;
190 my $self = $class->SUPER::new(@args);
191 my ($db) = $self->_rearrange([qw(DB)],@args);
192 $db ||= $self->default_db;
193 $self->db($db);
194 $self->url_base_address(DEFAULT_LOCATION) unless $self->url_base_address;
195 $self;
198 =head2 new_from_registry
200 Title : new_from_registry
201 Usage : $biofetch = $db->new_from_registry(%config)
202 Function: Creates a BioFetch object from the registry config hash
203 Returns : itself
204 Args : A configuration hash (see Registry.pm)
205 Throws :
208 =cut
210 sub new_from_registry {
211 my ($class,%config)=@_;
213 my $self = $class->SUPER::new(
214 -BASEADDRESS=>$config{'location'}
216 $self->db($config{'dbname'}) if $config{dbname};
217 return $self;
220 # from Bio::DB::RandomAccessI
222 =head2 get_Seq_by_id
224 Title : get_Seq_by_id
225 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
226 Function: Gets a Bio::Seq object by its name
227 Returns : a Bio::Seq object
228 Args : the id (as a string) of a sequence
229 Throws : "id does not exist" exception
232 =cut
234 =head2 get_Seq_by_acc
236 Title : get_Seq_by_acc
237 Usage : $seq = $db->get_Seq_by_acc('X77802');
238 Function: Gets a Bio::Seq object by accession number
239 Returns : A Bio::Seq object
240 Args : accession number (as a string)
241 Throws : "acc does not exist" exception
243 =cut
245 =head2 get_Seq_by_gi
247 Title : get_Seq_by_gi
248 Usage : $seq = $db->get_Seq_by_gi('405830');
249 Function: Gets a Bio::Seq object by gi number
250 Returns : A Bio::Seq object
251 Args : gi number (as a string)
252 Throws : "gi does not exist" exception
254 =cut
256 =head2 get_Seq_by_version
258 Title : get_Seq_by_version
259 Usage : $seq = $db->get_Seq_by_version('X77802.1');
260 Function: Gets a Bio::Seq object by sequence version
261 Returns : A Bio::Seq object
262 Args : accession.version (as a string)
263 Throws : "acc.version does not exist" exception
265 =cut
267 sub get_Seq_by_version {
268 my ($self,$seqid) = @_;
269 return $self->get_Seq_by_acc($seqid);
273 =head2 get_Stream_by_id
275 Title : get_Stream_by_id
276 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
277 Function: Gets a series of Seq objects by unique identifiers
278 Returns : a Bio::SeqIO stream object
279 Args : $ref : a reference to an array of unique identifiers for
280 the desired sequence entries
282 =cut
284 =head2 get_Stream_by_gi
286 Title : get_Stream_by_gi
287 Usage : $seq = $db->get_Seq_by_gi([$gi1, $gi2]);
288 Function: Gets a series of Seq objects by gi numbers
289 Returns : a Bio::SeqIO stream object
290 Args : $ref : a reference to an array of gi numbers for
291 the desired sequence entries
292 Note : For GenBank, this just calls the same code for get_Stream_by_id()
294 =cut
296 =head2 get_Stream_by_batch
298 Title : get_Stream_by_batch
299 Usage : $seq = $db->get_Stream_by_batch($ref);
300 Function: Get a series of Seq objects by their IDs
301 Example :
302 Returns : a Bio::SeqIO stream object
303 Args : $ref : an array reference containing a list of unique
304 ids/accession numbers.
306 In some of the Bio::DB::* moduels, get_Stream_by_id() is called
307 get_Stream_by_batch(). Since there seems to be no consensus, this
308 is provided as an alias.
310 =cut
312 *get_Stream_by_batch = \&Bio::DB::WebDBSeqI::get_Stream_by_id;
314 =head1 The remainder of these methods are for internal use
316 =head2 get_request
318 Title : get_request
319 Usage : my $url = $self->get_request
320 Function: returns a HTTP::Request object
321 Returns :
322 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
324 =cut
327 sub get_request {
328 my ($self, @qualifiers) = @_;
329 my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)],
330 @qualifiers);
331 my $db = $self->db;
332 my $namespace = $self->_namespace;
334 $self->throw("Must specify a value for UIDs to fetch")
335 unless defined $uids;
336 my $tmp;
337 my $format_string = '';
339 $format ||= $self->default_format;
340 ($format, $tmp) = $self->request_format($format);
342 my $base = $self->url_base_address;
343 my $uid = join('+',ref $uids ? @$uids : $uids);
344 $self->debug("\n$base$format_string&id=$uid\n");
345 return POST($base,
346 [ db => $namespace,
347 id => join('+',ref $uids ? @$uids : $uids),
348 format => $format,
349 style => 'raw'
353 =head2 default_format
355 Title : default_format
356 Usage : $format = $self->default_format
357 Function: return the default format
358 Returns : a string
359 Args :
361 =cut
363 sub default_format {
364 return 'default';
367 =head2 default_db
369 Title : default_db
370 Usage : $db = $self->default_db
371 Function: return the default database
372 Returns : a string
373 Args :
375 =cut
377 sub default_db { 'embl' }
379 =head2 db
381 Title : db
382 Usage : $db = $self->db([$db])
383 Function: get/set the database
384 Returns : a string
385 Args : new database
387 =cut
389 sub db {
390 my $self = shift;
392 if (@_) {
394 my $db = lc shift;
395 my $base = $self->url_base_address;
396 $FORMATMAP{$db} or $self->throw("invalid db [$db] at [$base], must be one of [".
397 join(' ',keys %FORMATMAP). "]");
398 $self->{_db} = $db;
400 return $self->{_db} || $self->default_db ;
403 sub _namespace {
404 my $self = shift;
405 my $db = $self->db;
406 return $FORMATMAP{$db}{namespace} || $db;
409 =head2 postprocess_data
411 Title : postprocess_data
412 Usage : $self->postprocess_data ( 'type' => 'string',
413 'location' => \$datastr);
414 Function: process downloaded data before loading into a Bio::SeqIO
415 Returns : void
416 Args : hash with two keys - 'type' can be 'string' or 'file'
417 - 'location' either file location or string
418 reference containing data
420 =cut
422 sub postprocess_data {
423 my ($self,%args) = @_;
425 # check for errors in the stream
426 if ($args{'type'} eq 'string') {
427 my $stringref = $args{'location'};
428 if ($$stringref =~ /^ERROR (\d+) (.+)/m) {
429 $self->throw("BioFetch Error $1: $2");
433 elsif ($args{'type'} eq 'file') {
434 open my $F, '<', $args{'location'} or $self->throw("Could not read file '$args{location}': $!");
435 # this is dumb, but the error may be anywhere on the first three lines because the
436 # CGI headers are sometimes printed out by the server...
437 my @data = grep {defined $_} (scalar <$F>, scalar <$F>, scalar <$F>);
438 close $F;
439 if (join('',@data) =~ /^ERROR (\d+) (.+)/m) {
440 $self->throw("BioFetch Error $1: $2");
444 else {
445 $self->throw("Don't know how to postprocess data of type $args{'type'}");
450 =head2 request_format
452 Title : request_format
453 Usage : my ($req_format, $ioformat) = $self->request_format;
454 $self->request_format("genbank");
455 $self->request_format("fasta");
456 Function: Get/Set sequence format retrieval. The get-form will normally not
457 be used outside of this and derived modules.
458 Returns : Array of two strings, the first representing the format for
459 retrieval, and the second specifying the corresponding SeqIO format.
460 Args : $format = sequence format
462 =cut
464 sub request_format {
465 my ($self, $value) = @_;
466 if ( defined $value ) {
467 my $db = $self->db;
468 my $namespace = $self->_namespace;
469 my $format = lc $value;
470 print "format:", $format, " module:", $FORMATMAP{$db}->{$format}, " ($namespace)\n"
471 if $self->verbose > 0;
472 $self->throw("Invalid format [$format], must be one of [".
473 join(' ',keys %{$FORMATMAP{$db}}). "]")
474 unless $format eq 'default' || $FORMATMAP{$db}->{$format};
476 $self->{'_format'} = [ $format, $FORMATMAP{$db}->{$format}];
478 return @{$self->{'_format'}};
482 =head2 Bio::DB::WebDBSeqI methods
484 Overriding WebDBSeqI method to help newbies to retrieve sequences.
485 EMBL database is all too often passed RefSeq accessions. This
486 redirects those calls. See L<Bio::DB::RefSeq>.
489 =head2 get_Stream_by_acc
491 Title : get_Stream_by_acc
492 Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]);
493 Function: Gets a series of Seq objects by accession numbers
494 Returns : a Bio::SeqIO stream object
495 Args : $ref : a reference to an array of accession numbers for
496 the desired sequence entries
498 =cut
500 sub get_Stream_by_acc {
501 my ($self, $ids ) = @_;
502 $self->_check_id($ids);
503 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
507 =head2 _check_id
509 Title : _check_id
510 Usage :
511 Function: Throw on whole chromosome NCBI sequences not in sequence databases
512 and redirect RefSeq accession requests sent to EMBL.
513 Returns :
514 Args : $id(s), $string
515 Throws : if accessionn number indicates whole chromosome NCBI sequence
517 =cut
519 sub _check_id {
520 my ($self, $id) = @_;
522 # NT contigs can not be retrieved
523 $self->throw("NT_ contigs are whole chromosome files which are not part of regular ".
524 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
525 if $id =~ /NT_/;
527 # Asking for a RefSeq from EMBL/GenBank
529 if ($id =~ /N._/ && $self->db ne 'refseq') {
530 $self->warn("[$id] is not a normal sequence entry but a RefSeq entry.".
531 " Redirecting the request.\n")
532 if $self->verbose >= 0;
533 $self->db('RefSeq');