2 # BioPerl module for Bio::DB::BioFetch
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Lincoln Stein <lstein@cshl.org>
8 # Copyright Lincoln Stein
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
15 package Bio
::DB
::BioFetch
;
17 use HTTP
::Request
::Common
'POST';
21 Bio::DB::BioFetch - Database object interface to BioFetch retrieval
25 use Bio::DB::BioFetch;
27 $bf = Bio::DB::BioFetch->new();
29 $seq = $bf->get_Seq_by_id('HSFOS'); # EMBL or SWALL ID
31 # change formats, storage procedures
32 $bf = Bio::DB::BioFetch->new(-format => 'fasta',
33 -retrievaltype => 'tempfile',
36 $stream = $bf->get_Stream_by_id(['HSFOS','J00231']);
37 while (my $s = $stream->next_seq) {
43 $seq = $bf->get_Seq_by_version('NM_006732.1'); # RefSeq VERSION
45 print "accession is ", $seq->accession_number, "\n" unless $@;
50 Bio::DB::BioFetch is a guaranteed best effort sequence entry fetching
51 method. It goes to the Web-based dbfetch server located at the EBI
52 (http://www.ebi.ac.uk/Tools/dbfetch/dbfetch) to retrieve sequences in the
53 EMBL or GenBank sequence repositories.
55 This module implements all the Bio::DB::RandomAccessI interface, plus
56 the get_Stream_by_id() and get_Stream_by_acc() methods that are found
57 in the Bio::DB::SwissProt interface.
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to one
65 of the Bioperl mailing lists. Your participation is much appreciated.
68 bioperl-l@bioperl.org - General discussion
69 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
73 Please direct usage questions or support issues to the mailing list:
75 I<bioperl-l@bioperl.org>
77 rather than to the module maintainer directly. Many experienced and
78 reponsive experts will be able look at the problem and quickly
79 address it. Please include a thorough description of the problem
80 with code and data examples if at all possible.
84 Report bugs to the Bioperl bug tracking system to help us keep track
85 the bugs and their resolution. Bug reports can be submitted via the
88 https://github.com/bioperl/bioperl-live/issues
90 =head1 AUTHOR - Lincoln Stein
92 Email Lincoln Stein E<lt>lstein@cshl.orgE<lt>
94 Also thanks to Heikki Lehvaslaiho E<lt>heikki-at-bioperl-dot-orgE<gt> for the
95 BioFetch server and interface specification.
99 The rest of the documentation details each of the object
100 methods. Internal methods are usually preceded with a _
104 # Let the code begin...
105 use vars
qw(%FORMATMAP);
106 use base qw(Bio::DB::WebDBSeqI Bio::Root::Root);
108 # warning: names used here must map into Bio::SeqIO::* space
109 use constant DEFAULT_LOCATION
=> 'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch';
115 default => 'embl', # default BioFetch format/SeqIOmodule pair
116 embl
=> 'embl', # alternative BioFetch format/module pair
117 fasta
=> 'fasta', # alternative BioFetch format/module pair
122 swissprot
=> 'swiss',
124 namespace
=> 'uniprot',
127 default => 'genbank',
128 genbank
=> 'genbank',
130 namespace
=> 'RefSeq',
134 swissprot
=> 'swiss',
136 namespace
=> 'uniprot',
140 swissprot
=> 'swiss',
142 namespace
=> 'uniprot',
145 default => 'genbank',
146 genbank
=> 'genbank',
147 namespace
=> 'genbank',
150 default => 'genbank',
151 genbank
=> 'genbank',
152 namespace
=> 'genpep',
156 swissprot
=> 'swiss',
158 namespace
=> 'unisave',
166 Usage : $bf = Bio::DB::BioFetch->new(@args)
167 Function: Construct a new Bio::DB::BioFetch object
168 Returns : a Bio::DB::BioFetch object
172 @args are standard -name=E<gt>value options as listed in the following
173 table. If you do not provide any options, the module assumes reasonable
179 -baseaddress location of dbfetch server http://www.ebi.ac.uk/Tools/dbfetch/dbfetch
180 -retrievaltype "tempfile" or "io_string" io_string
181 -format "embl", "fasta", "swissprot", embl
183 -db "embl", "genbank" or "swissprot" embl
189 my ($class,@args) = @_;
190 my $self = $class->SUPER::new
(@args);
191 my ($db) = $self->_rearrange([qw(DB)],@args);
192 $db ||= $self->default_db;
194 $self->url_base_address(DEFAULT_LOCATION
) unless $self->url_base_address;
198 =head2 new_from_registry
200 Title : new_from_registry
201 Usage : $biofetch = $db->new_from_registry(%config)
202 Function: Creates a BioFetch object from the registry config hash
204 Args : A configuration hash (see Registry.pm)
210 sub new_from_registry
{
211 my ($class,%config)=@_;
213 my $self = $class->SUPER::new
(
214 -BASEADDRESS
=>$config{'location'}
216 $self->db($config{'dbname'}) if $config{dbname
};
220 # from Bio::DB::RandomAccessI
224 Title : get_Seq_by_id
225 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
226 Function: Gets a Bio::Seq object by its name
227 Returns : a Bio::Seq object
228 Args : the id (as a string) of a sequence
229 Throws : "id does not exist" exception
234 =head2 get_Seq_by_acc
236 Title : get_Seq_by_acc
237 Usage : $seq = $db->get_Seq_by_acc('X77802');
238 Function: Gets a Bio::Seq object by accession number
239 Returns : A Bio::Seq object
240 Args : accession number (as a string)
241 Throws : "acc does not exist" exception
247 Title : get_Seq_by_gi
248 Usage : $seq = $db->get_Seq_by_gi('405830');
249 Function: Gets a Bio::Seq object by gi number
250 Returns : A Bio::Seq object
251 Args : gi number (as a string)
252 Throws : "gi does not exist" exception
256 =head2 get_Seq_by_version
258 Title : get_Seq_by_version
259 Usage : $seq = $db->get_Seq_by_version('X77802.1');
260 Function: Gets a Bio::Seq object by sequence version
261 Returns : A Bio::Seq object
262 Args : accession.version (as a string)
263 Throws : "acc.version does not exist" exception
267 sub get_Seq_by_version
{
268 my ($self,$seqid) = @_;
269 return $self->get_Seq_by_acc($seqid);
273 =head2 get_Stream_by_id
275 Title : get_Stream_by_id
276 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
277 Function: Gets a series of Seq objects by unique identifiers
278 Returns : a Bio::SeqIO stream object
279 Args : $ref : a reference to an array of unique identifiers for
280 the desired sequence entries
284 =head2 get_Stream_by_gi
286 Title : get_Stream_by_gi
287 Usage : $seq = $db->get_Seq_by_gi([$gi1, $gi2]);
288 Function: Gets a series of Seq objects by gi numbers
289 Returns : a Bio::SeqIO stream object
290 Args : $ref : a reference to an array of gi numbers for
291 the desired sequence entries
292 Note : For GenBank, this just calls the same code for get_Stream_by_id()
296 =head2 get_Stream_by_batch
298 Title : get_Stream_by_batch
299 Usage : $seq = $db->get_Stream_by_batch($ref);
300 Function: Get a series of Seq objects by their IDs
302 Returns : a Bio::SeqIO stream object
303 Args : $ref : an array reference containing a list of unique
304 ids/accession numbers.
306 In some of the Bio::DB::* moduels, get_Stream_by_id() is called
307 get_Stream_by_batch(). Since there seems to be no consensus, this
308 is provided as an alias.
312 *get_Stream_by_batch
= \
&Bio
::DB
::WebDBSeqI
::get_Stream_by_id
;
314 =head1 The remainder of these methods are for internal use
319 Usage : my $url = $self->get_request
320 Function: returns a HTTP::Request object
322 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
328 my ($self, @qualifiers) = @_;
329 my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)],
332 my $namespace = $self->_namespace;
334 $self->throw("Must specify a value for UIDs to fetch")
335 unless defined $uids;
337 my $format_string = '';
339 $format ||= $self->default_format;
340 ($format, $tmp) = $self->request_format($format);
342 my $base = $self->url_base_address;
343 my $uid = join('+',ref $uids ? @
$uids : $uids);
344 $self->debug("\n$base$format_string&id=$uid\n");
347 id
=> join('+',ref $uids ? @
$uids : $uids),
353 =head2 default_format
355 Title : default_format
356 Usage : $format = $self->default_format
357 Function: return the default format
370 Usage : $db = $self->default_db
371 Function: return the default database
377 sub default_db
{ 'embl' }
382 Usage : $db = $self->db([$db])
383 Function: get/set the database
395 my $base = $self->url_base_address;
396 $FORMATMAP{$db} or $self->throw("invalid db [$db] at [$base], must be one of [".
397 join(' ',keys %FORMATMAP). "]");
400 return $self->{_db
} || $self->default_db ;
406 return $FORMATMAP{$db}{namespace
} || $db;
409 =head2 postprocess_data
411 Title : postprocess_data
412 Usage : $self->postprocess_data ( 'type' => 'string',
413 'location' => \$datastr);
414 Function: process downloaded data before loading into a Bio::SeqIO
416 Args : hash with two keys - 'type' can be 'string' or 'file'
417 - 'location' either file location or string
418 reference containing data
422 sub postprocess_data
{
423 my ($self,%args) = @_;
425 # check for errors in the stream
426 if ($args{'type'} eq 'string') {
427 my $stringref = $args{'location'};
428 if ($$stringref =~ /^ERROR (\d+) (.+)/m) {
429 $self->throw("BioFetch Error $1: $2");
433 elsif ($args{'type'} eq 'file') {
434 open my $F, '<', $args{'location'} or $self->throw("Could not read file '$args{location}': $!");
435 # this is dumb, but the error may be anywhere on the first three lines because the
436 # CGI headers are sometimes printed out by the server...
437 my @data = grep {defined $_} (scalar <$F>, scalar <$F>, scalar <$F>);
439 if (join('',@data) =~ /^ERROR (\d+) (.+)/m) {
440 $self->throw("BioFetch Error $1: $2");
445 $self->throw("Don't know how to postprocess data of type $args{'type'}");
450 =head2 request_format
452 Title : request_format
453 Usage : my ($req_format, $ioformat) = $self->request_format;
454 $self->request_format("genbank");
455 $self->request_format("fasta");
456 Function: Get/Set sequence format retrieval. The get-form will normally not
457 be used outside of this and derived modules.
458 Returns : Array of two strings, the first representing the format for
459 retrieval, and the second specifying the corresponding SeqIO format.
460 Args : $format = sequence format
465 my ($self, $value) = @_;
466 if ( defined $value ) {
468 my $namespace = $self->_namespace;
469 my $format = lc $value;
470 print "format:", $format, " module:", $FORMATMAP{$db}->{$format}, " ($namespace)\n"
471 if $self->verbose > 0;
472 $self->throw("Invalid format [$format], must be one of [".
473 join(' ',keys %{$FORMATMAP{$db}}). "]")
474 unless $format eq 'default' || $FORMATMAP{$db}->{$format};
476 $self->{'_format'} = [ $format, $FORMATMAP{$db}->{$format}];
478 return @
{$self->{'_format'}};
482 =head2 Bio::DB::WebDBSeqI methods
484 Overriding WebDBSeqI method to help newbies to retrieve sequences.
485 EMBL database is all too often passed RefSeq accessions. This
486 redirects those calls. See L<Bio::DB::RefSeq>.
489 =head2 get_Stream_by_acc
491 Title : get_Stream_by_acc
492 Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]);
493 Function: Gets a series of Seq objects by accession numbers
494 Returns : a Bio::SeqIO stream object
495 Args : $ref : a reference to an array of accession numbers for
496 the desired sequence entries
500 sub get_Stream_by_acc
{
501 my ($self, $ids ) = @_;
502 $self->_check_id($ids);
503 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
511 Function: Throw on whole chromosome NCBI sequences not in sequence databases
512 and redirect RefSeq accession requests sent to EMBL.
514 Args : $id(s), $string
515 Throws : if accessionn number indicates whole chromosome NCBI sequence
520 my ($self, $id) = @_;
522 # NT contigs can not be retrieved
523 $self->throw("NT_ contigs are whole chromosome files which are not part of regular ".
524 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
527 # Asking for a RefSeq from EMBL/GenBank
529 if ($id =~ /N._/ && $self->db ne 'refseq') {
530 $self->warn("[$id] is not a normal sequence entry but a RefSeq entry.".
531 " Redirecting the request.\n")
532 if $self->verbose >= 0;