2 # BioPerl module for Bio::DB::WebDBSeqI
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
17 Bio::DB::WebDBSeqI - Object Interface to generalize Web Databases
18 for retrieving sequences
22 # get a WebDBSeqI object somehow
23 # assuming it is a nucleotide db
24 my $seq = $db->get_Seq_by_id('ROA1_HUMAN')
28 Provides core set of functionality for connecting to a web based
29 database for retrieving sequences.
31 Users wishing to add another Web Based Sequence Dabatase will need to
32 extend this class (see L<Bio::DB::SwissProt> or L<Bio::DB::NCBIHelper> for
33 examples) and implement the get_request method which returns a
34 HTTP::Request for the specified uids (accessions, ids, etc depending
35 on what query types the database accepts).
41 User feedback is an integral part of the
42 evolution of this and other Bioperl modules. Send
43 your comments and suggestions preferably to one
44 of the Bioperl mailing lists. Your participation
47 bioperl-l@bioperl.org - General discussion
48 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52 Please direct usage questions or support issues to the mailing list:
54 I<bioperl-l@bioperl.org>
56 rather than to the module maintainer directly. Many experienced and
57 reponsive experts will be able look at the problem and quickly
58 address it. Please include a thorough description of the problem
59 with code and data examples if at all possible.
63 Report bugs to the Bioperl bug tracking system to
64 help us keep track the bugs and their resolution.
65 Bug reports can be submitted via the web.
67 https://github.com/bioperl/bioperl-live/issues
69 =head1 AUTHOR - Jason Stajich
71 Email E<lt> jason@bioperl.org E<gt>
75 The rest of the documentation details each of the
76 object methods. Internal methods are usually
81 # Let the code begin...
83 package Bio
::DB
::WebDBSeqI
;
89 use vars
qw($MODVERSION %RETRIEVAL_TYPES $DEFAULT_RETRIEVAL_TYPE
90 $DEFAULTFORMAT $LAST_INVOCATION_TIME @ATTRIBUTES);
96 use HTTP::Request::Common;
103 use base qw(Bio::DB::RandomAccessI);
107 %RETRIEVAL_TYPES = ('io_string' => 1,
111 $DEFAULT_RETRIEVAL_TYPE = 'pipeline';
112 $DEFAULTFORMAT = 'fasta';
113 $LAST_INVOCATION_TIME = 0;
117 my ($class, @args) = @_;
118 my $self = $class->SUPER::new
(@args);
119 my ($baseaddress, $params, $ret_type, $format,$delay,$db) =
120 $self->_rearrange([qw(BASEADDRESS PARAMS RETRIEVALTYPE FORMAT DELAY DB)],
123 $ret_type = $DEFAULT_RETRIEVAL_TYPE unless ( $ret_type);
124 $baseaddress && $self->url_base_address($baseaddress);
125 $params && $self->url_params($params);
126 $db && $self->db($db);
127 $ret_type && $self->retrieval_type($ret_type);
128 $delay = $self->delay_policy unless defined $delay;
129 $self->delay($delay);
132 # insure we always have a default format set for retrieval
133 # even though this will be immedietly overwritten by most sub classes
134 $format = $self->default_format unless ( defined $format &&
137 $self->request_format($format);
138 my $ua = LWP
::UserAgent
->new(env_proxy
=> 1);
139 $ua->agent(ref($self) ."/$MODVERSION");
141 $self->{'_authentication'} = [];
145 # from Bio::DB::RandomAccessI
149 Title : get_Seq_by_id
150 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
151 Function: Gets a Bio::Seq object by its name
152 Returns : a Bio::Seq object
153 Args : the id (as a string) of a sequence
154 Throws : "id does not exist" exception
160 my ($self,$seqid) = @_;
162 my $seqio = $self->get_Stream_by_id([$seqid]);
163 $self->throw("id does not exist") if( !defined $seqio ) ;
164 if ($self->can('complexity') && defined $self->complexity && $self->complexity==0) {
165 $self->warn("When complexity is set to 0, use get_Stream_by_id\n".
166 "Returning Bio::SeqIO object");
170 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; }
172 # Since $seqio will not be used anymore, explicitly close its filehandle
173 # or it will cause trouble later on cleanup
176 $self->throw("id '$seqid' does not exist") unless @seqs;
177 if( wantarray ) { return @seqs } else { return shift @seqs }
180 =head2 get_Seq_by_acc
182 Title : get_Seq_by_acc
183 Usage : $seq = $db->get_Seq_by_acc('X77802');
184 Function: Gets a Bio::Seq object by accession number
185 Returns : A Bio::Seq object
186 Args : accession number (as a string)
187 Throws : "acc does not exist" exception
192 my ($self,$seqid) = @_;
194 my $seqio = $self->get_Stream_by_acc($seqid);
195 $self->throw("acc '$seqid' does not exist") if( ! defined $seqio );
196 if ($self->can('complexity') && defined $self->complexity && $self->complexity==0) {
197 $self->warn("When complexity is set to 0, use get_Stream_by_acc\n".
198 "Returning Bio::SeqIO object");
202 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; }
203 $self->throw("acc $seqid does not exist") unless @seqs;
204 if( wantarray ) { return @seqs } else { return shift @seqs }
210 Title : get_Seq_by_gi
211 Usage : $seq = $db->get_Seq_by_gi('405830');
212 Function: Gets a Bio::Seq object by gi number
213 Returns : A Bio::Seq object
214 Args : gi number (as a string)
215 Throws : "gi does not exist" exception
220 my ($self,$seqid) = @_;
222 my $seqio = $self->get_Stream_by_gi($seqid);
223 $self->throw("gi does not exist") if( !defined $seqio );
224 if ($self->can('complexity') && defined $self->complexity && $self->complexity==0) {
225 $self->warn("When complexity is set to 0, use get_Stream_by_gi\n".
226 "Returning Bio::SeqIO object");
230 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; }
231 $self->throw("gi does not exist") unless @seqs;
232 if( wantarray ) { return @seqs } else { return shift @seqs }
235 =head2 get_Seq_by_version
237 Title : get_Seq_by_version
238 Usage : $seq = $db->get_Seq_by_version('X77802.1');
239 Function: Gets a Bio::Seq object by sequence version
240 Returns : A Bio::Seq object
241 Args : accession.version (as a string)
242 Throws : "acc.version does not exist" exception
246 sub get_Seq_by_version
{
247 my ($self,$seqid) = @_;
249 my $seqio = $self->get_Stream_by_version($seqid);
250 $self->throw("accession.version does not exist") if( !defined $seqio );
251 if ($self->can('complexity') && defined $self->complexity && $self->complexity==0) {
252 $self->warn("When complexity is set to 0, use get_Stream_by_version\n".
253 "Returning Bio::SeqIO object");
257 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; }
258 $self->throw("accession.version does not exist") unless @seqs;
259 if( wantarray ) { return @seqs } else { return shift @seqs }
262 # implementing class must define these
267 Usage : my $url = $self->get_request
268 Function: returns a HTTP::Request object
270 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
276 my $msg = "Implementing class must define method get_request in class WebDBSeqI";
282 =head2 get_Stream_by_id
284 Title : get_Stream_by_id
285 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
286 Function: Gets a series of Seq objects by unique identifiers
287 Returns : a Bio::SeqIO stream object
288 Args : $ref : a reference to an array of unique identifiers for
289 the desired sequence entries
294 sub get_Stream_by_id
{
295 my ($self, $ids) = @_;
296 my ($webfmt,$localfmt) = $self->request_format;
297 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single',
298 '-format' => $webfmt);
301 *get_Stream_by_batch
= sub {
303 Carp
::carp
('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
304 $self->get_Stream_by_id(@_)
308 =head2 get_Stream_by_acc
310 Title : get_Stream_by_acc
311 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
312 Function: Gets a series of Seq objects by accession numbers
313 Returns : a Bio::SeqIO stream object
314 Args : $ref : a reference to an array of accession numbers for
315 the desired sequence entries
316 Note : For GenBank, this just calls the same code for get_Stream_by_id()
320 sub get_Stream_by_acc
{
321 my ($self, $ids ) = @_;
322 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
326 =head2 get_Stream_by_gi
328 Title : get_Stream_by_gi
329 Usage : $seq = $db->get_Stream_by_gi([$gi1, $gi2]);
330 Function: Gets a series of Seq objects by gi numbers
331 Returns : a Bio::SeqIO stream object
332 Args : $ref : a reference to an array of gi numbers for
333 the desired sequence entries
334 Note : For GenBank, this just calls the same code for get_Stream_by_id()
338 sub get_Stream_by_gi
{
339 my ($self, $ids ) = @_;
340 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'gi');
343 =head2 get_Stream_by_version
345 Title : get_Stream_by_version
346 Usage : $seq = $db->get_Stream_by_version([$version1, $version2]);
347 Function: Gets a series of Seq objects by accession.versions
348 Returns : a Bio::SeqIO stream object
349 Args : $ref : a reference to an array of accession.version strings for
350 the desired sequence entries
351 Note : For GenBank, this is implemented in NCBIHelper
355 sub get_Stream_by_version
{
356 my ($self, $ids ) = @_;
357 # $self->throw("Implementing class should define this method!");
358 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'version'); # how it should work
361 =head2 get_Stream_by_query
363 Title : get_Stream_by_query
364 Usage : $stream = $db->get_Stream_by_query($query);
365 Function: Gets a series of Seq objects by way of a query string or oject
366 Returns : a Bio::SeqIO stream object
367 Args : $query : A string that uses the appropriate query language
368 for the database or a Bio::DB::QueryI object. It is suggested
369 that you create the Bio::DB::Query object first and interrogate
370 it for the entry count before you fetch a potentially large stream.
374 sub get_Stream_by_query
{
375 my ($self, $query ) = @_;
376 return $self->get_seq_stream('-query' => $query, '-mode'=>'query');
379 =head2 default_format
381 Title : default_format
382 Usage : my $format = $self->default_format
383 Function: Returns default sequence format for this module
390 return $DEFAULTFORMAT;
393 # sorry, but this is hacked in because of BioFetch problems...
396 my $d = $self->{_db
};
397 $self->{_db
} = shift if @_;
401 =head2 request_format
403 Title : request_format
404 Usage : my ($req_format, $ioformat) = $self->request_format;
405 $self->request_format("genbank");
406 $self->request_format("fasta");
407 Function: Get/Set sequence format retrieval. The get-form will normally not
408 be used outside of this and derived modules.
409 Returns : Array of two strings, the first representing the format for
410 retrieval, and the second specifying the corresponding SeqIO format.
411 Args : $format = sequence format
416 my ($self, $value) = @_;
418 if( defined $value ) {
419 $self->{'_format'} = [ $value, $value];
421 return @
{$self->{'_format'}};
424 =head2 get_seq_stream
426 Title : get_seq_stream
427 Usage : my $seqio = $self->get_seq_stream(%qualifiers)
428 Function: builds a url and queries a web db
429 Returns : a Bio::SeqIO stream capable of producing sequence
430 Args : %qualifiers = a hash qualifiers that the implementing class
431 will process to make a url suitable for web querying
436 my ($self, %qualifiers) = @_;
437 my ($rformat, $ioformat) = $self->request_format();
439 foreach my $key ( keys %qualifiers ) {
440 if( $key =~ /format/i ) {
441 $rformat = $qualifiers{$key};
445 $qualifiers{'-format'} = $rformat if( !$seen);
446 ($rformat, $ioformat) = $self->request_format($rformat);
447 # These parameters are implemented for Bio::DB::GenBank objects only
448 if($self->isa('Bio::DB::GenBank')) {
449 $self->seq_start() && ($qualifiers{'-seq_start'} = $self->seq_start());
450 $self->seq_stop() && ($qualifiers{'-seq_stop'} = $self->seq_stop());
451 $self->strand() && ($qualifiers{'-strand'} = $self->strand());
452 defined $self->complexity() && ($qualifiers{'-complexity'} = $self->complexity());
454 my $request = $self->get_request(%qualifiers);
455 $request->proxy_authorization_basic($self->authentication)
456 if ( $self->authentication);
457 $self->debug("request is ". $request->as_string(). "\n");
459 # workaround for MSWin systems
460 $self->retrieval_type('io_string') if $self->retrieval_type =~ /pipeline/ && $^O
=~ /^MSWin/;
462 if ($self->retrieval_type =~ /pipeline/) {
463 # Try to create a stream using POSIX fork-and-pipe facility.
464 # this is a *big* win when fetching thousands of sequences from
465 # a web database because we can return the first entry while
466 # transmission is still in progress.
467 # Also, no need to keep sequence in memory or in a temporary file.
468 # If this fails (Windows, MacOS 9), we fall back to non-pipelined access.
470 # fork and pipe: _stream_request()=><STREAM>
471 my ($result,$stream) = $self->_open_pipe();
473 if (defined $result) {
474 $DB::fork_TTY
= File
::Spec
->devnull; # prevents complaints from debugger
475 if (!$result) { # in child process
476 $self->_stream_request($request,$stream);
477 POSIX
::_exit
(0); #prevent END blocks from executing in this forked child
480 return Bio
::SeqIO
->new('-verbose' => $self->verbose,
481 '-format' => $ioformat,
486 $self->retrieval_type('io_string');
490 if ($self->retrieval_type =~ /temp/i) {
491 my $dir = $self->io->tempdir( CLEANUP
=> 1);
492 my ( $fh, $tmpfile) = $self->io()->tempfile( DIR
=> $dir );
494 my $resp = $self->_request($request, $tmpfile);
495 if( ! -e
$tmpfile || -z
$tmpfile || ! $resp->is_success() ) {
496 $self->throw("WebDBSeqI Error - check query sequences!\n");
498 $self->postprocess_data('type' => 'file',
499 'location' => $tmpfile);
500 # this may get reset when requesting batch mode
501 ($rformat,$ioformat) = $self->request_format();
502 if( $self->verbose > 0 ) {
503 open my $ERR, '<', $tmpfile or $self->throw("Could not read file '$tmpfile': $!");
504 while(<$ERR>) { $self->debug($_);}
508 return Bio
::SeqIO
->new('-verbose' => $self->verbose,
509 '-format' => $ioformat,
510 '-file' => $tmpfile);
513 if ($self->retrieval_type =~ /io_string/i ) {
514 my $resp = $self->_request($request);
515 my $content = $resp->content_ref;
516 $self->debug( "content is $$content\n");
517 if (!$resp->is_success() || length($$content) == 0) {
518 $self->throw("WebDBSeqI Error - check query sequences!\n");
520 ($rformat,$ioformat) = $self->request_format();
521 $self->postprocess_data('type'=> 'string',
522 'location' => $content);
523 $self->debug( "str is $$content\n");
524 return Bio
::SeqIO
->new('-verbose' => $self->verbose,
525 '-format' => $ioformat,
526 '-fh' => new IO
::String
($$content));
529 # if we got here, we don't know how to handle the retrieval type
530 $self->throw("retrieval type " . $self->retrieval_type .
534 =head2 url_base_address
536 Title : url_base_address
537 Usage : my $address = $self->url_base_address or
538 $self->url_base_address($address)
539 Function: Get/Set the base URL for the Web Database
540 Returns : Base URL for the Web Database
541 Args : $address - URL for the WebDatabase
545 sub url_base_address
{
547 my $d = $self->{'_baseaddress'};
548 $self->{'_baseaddress'} = shift if @_;
556 Usage : $httpproxy = $db->proxy('http') or
557 $db->proxy(['http','ftp'], 'http://myproxy' )
558 Function: Get/Set a proxy for use of proxy
559 Returns : a string indicating the proxy
560 Args : $protocol : an array ref of the protocol(s) to set/get
561 $proxyurl : url of the proxy to use for the specified protocol
562 $username : username (if proxy requires authentication)
563 $password : password (if proxy requires authentication)
568 my ($self,$protocol,$proxy,$username,$password) = @_;
569 return if ( !defined $self->ua || !defined $protocol
570 || !defined $proxy );
571 $self->authentication($username, $password)
572 if ($username && $password);
573 return $self->ua->proxy($protocol,$proxy);
576 =head2 authentication
578 Title : authentication
579 Usage : $db->authentication($user,$pass)
580 Function: Get/Set authentication credentials
581 Returns : Array of user/pass
582 Args : Array or user/pass
588 my ($self,$u,$p) = @_;
590 if( defined $u && defined $p ) {
591 $self->{'_authentication'} = [ $u,$p];
593 return @
{$self->{'_authentication'}};
597 =head2 retrieval_type
599 Title : retrieval_type
600 Usage : $self->retrieval_type($type);
601 my $type = $self->retrieval_type
602 Function: Get/Set a proxy for retrieval_type (pipeline, io_string or tempfile)
603 Returns : string representing retrieval type
604 Args : $value - the value to store
606 This setting affects how the data stream from the remote web server is
607 processed and passed to the Bio::SeqIO layer. Three types of retrieval
608 types are currently allowed:
610 pipeline Perform a fork in an attempt to begin streaming
611 while the data is still downloading from the remote
612 server. Disk, memory and speed efficient, but will
613 not work on Windows or MacOS 9 platforms.
615 io_string Store downloaded database entry(s) in memory. Can be
616 problematic for batch downloads because entire set
617 of entries must fit in memory. Alll entries must be
618 downloaded before processing can begin.
620 tempfile Store downloaded database entry(s) in a temporary file.
621 All entries must be downloaded before processing can
624 The default is pipeline, with automatic fallback to io_string if
625 pipelining is not available.
630 my ($self, $value) = @_;
631 if( defined $value ) {
633 if( ! $RETRIEVAL_TYPES{$value} ) {
634 $self->warn("invalid retrieval type $value must be one of (" .
635 join(",", keys %RETRIEVAL_TYPES), ")");
636 $value = $DEFAULT_RETRIEVAL_TYPE;
638 $self->{'_retrieval_type'} = $value;
640 return $self->{'_retrieval_type'};
646 Usage : my $params = $self->url_params or
647 $self->url_params($params)
648 Function: Get/Set the URL parameters for the Web Database
649 Returns : url parameters for Web Database
650 Args : $params - parameters to be appended to the URL for the WebDatabase
655 my ($self, $value) = @_;
656 if( defined $value ) {
657 $self->{'_urlparams'} = $value;
664 Usage : my $ua = $self->ua or
666 Function: Get/Set a LWP::UserAgent for use
667 Returns : reference to LWP::UserAgent Object
668 Args : $ua - must be a LWP::UserAgent
673 my ($self, $ua) = @_;
674 if( defined $ua && $ua->isa("LWP::UserAgent") ) {
675 $self->{'_ua'} = $ua;
677 return $self->{'_ua'};
680 =head2 postprocess_data
682 Title : postprocess_data
683 Usage : $self->postprocess_data ( 'type' => 'string',
684 'location' => \$datastr);
685 Function: process downloaded data before loading into a Bio::SeqIO
687 Args : hash with two keys - 'type' can be 'string' or 'file'
688 - 'location' either file location or string
689 reference containing data
693 sub postprocess_data
{
694 my ( $self, %args) = @_;
700 my ($self, $url,$tmpfile) = @_;
702 if( defined $tmpfile && $tmpfile ne '' ) {
703 $resp = $self->ua->request($url, $tmpfile);
705 $resp = $self->ua->request($url);
708 if( $resp->is_error ) {
709 $self->throw("WebDBSeqI Request Error:\n".$resp->as_string);
714 #mod_perl-safe replacement for the open(BLEH,'-|') call. if running
715 #under mod_perl, detects it and closes the child's STDIN and STDOUT
719 # is mod_perl running? Which API?
720 my $mp = $self->mod_perl_api;
721 if($mp and ! our $loaded_apache_sp) {
722 my $load_api = ($mp == 1) ?
'use Apache::SubProcess': 'use Apache2::SubProcess';
724 $@
and $self->throw("$@\n$load_api module required for running under mod_perl");
725 $loaded_apache_sp = 1;
728 my $pipe = IO
::Pipe
->new();
730 local $SIG{CHLD
} = 'IGNORE';
731 defined(my $pid = fork)
732 or $self->throw("Couldn't fork: $!");
738 #if we're running under mod_perl, clean up some things after this fork
739 if ($ENV{MOD_PERL
} and my $r = eval{Apache
->request} ) {
740 $r->cleanup_for_exec;
741 #don't read or write the mod_perl parent's tied filehandles
742 close STDIN
; close STDOUT
;
743 setsid
() or $self->throw('Could not detach from parent');
749 return ( $pid, $pipe );
752 # send web request to specified filehandle, or stdout, for streaming purposes
753 sub _stream_request
{
756 my $dest_fh = shift || \
*STDOUT
;
758 # fork so as to pipe output of fetch process through to
759 # postprocess_data method call.
760 my ($child,$fetch) = $self->_open_pipe();
764 local ($/) = "//\n"; # assume genbank/swiss format
767 while (my $record = <$fetch>) {
769 $self->postprocess_data('type' => 'string',
770 'location' => \
$record);
771 print $dest_fh $record;
773 $/ = "\n"; # reset to be safe;
774 close $dest_fh; #must explicitly close here, because the hard
775 #exits don't cloes them for us
780 my $resp = $self->ua->request($request,
781 sub { print $fetch $_[0] }
783 if( $resp->is_error ) {
784 $self->throw("WebDBSeqI Request Error:\n".$resp->as_string);
786 close $fetch; #must explicitly close here, because the hard exists
787 #don't close them for us
795 if(defined($io) || (! exists($self->{'_io'}))) {
796 $io = Bio
::Root
::IO
->new() unless $io;
797 $self->{'_io'} = $io;
799 return $self->{'_io'};
806 Usage : $secs = $self->delay([$secs])
807 Function: get/set number of seconds to delay between fetches
808 Returns : number of seconds to delay
811 NOTE: the default is to use the value specified by delay_policy().
812 This can be overridden by calling this method, or by passing the
813 -delay argument to new().
819 my $d = $self->{'_delay'};
820 $self->{'_delay'} = shift if @_;
827 Usage : $secs = $self->delay_policy
828 Function: return number of seconds to delay between calls to remote db
829 Returns : number of seconds to delay
832 NOTE: The default delay policy is 0s. Override in subclasses to
833 implement delays. The timer has only second resolution, so the delay
834 will actually be +/- 1s.
846 Usage : $self->_sleep
847 Function: sleep for a number of seconds indicated by the delay policy
851 NOTE: This method keeps track of the last time it was called and only
852 imposes a sleep if it was called more recently than the delay_policy()
859 my $last_invocation = $LAST_INVOCATION_TIME;
860 if (time - $LAST_INVOCATION_TIME < $self->delay) {
861 my $delay = $self->delay - (time - $LAST_INVOCATION_TIME);
862 warn "sleeping for $delay seconds\n" if $self->verbose > 0;
865 $LAST_INVOCATION_TIME = time;
871 Usage : $version = self->mod_perl_api
872 Function: Returns API version of mod_perl being used based on set env. variables
873 Returns : mod_perl API version; if mod_perl isn't loaded, returns 0
880 my $v = $ENV{MOD_PERL
} ?
881 ( exists $ENV{MOD_PERL_API_VERSION
} && $ENV{MOD_PERL_API_VERSION
} >= 2 ) ?