2 # BioPerl module for Bio::Tools::Run::RemoteBlast
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # FORMERLY Cared for by Jason Stajich, Mat Wiepert
8 # Somewhat cared for by Roger Hall, Chris Fields (when they have time)
10 # Copyright Jason Stajich, Bioperl
12 # You may distribute this module under the same terms as perl itself
14 # POD documentation - main docs before the code
18 Bio::Tools::Run::RemoteBlast - Object for remote execution of the NCBI Blast
23 #Remote-blast "factory object" creation and blast-parameter initialization
25 use Bio::Tools::Run::RemoteBlast;
31 my @params = ( '-prog' => $prog,
34 '-readmethod' => 'SearchIO' );
36 my $factory = Bio::Tools::Run::RemoteBlast->new(@params);
38 #change a query parameter
39 $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'Homo sapiens [ORGN]';
41 #change a retrieval parameter
42 $Bio::Tools::Run::RemoteBlast::RETRIEVALHEADER{'DESCRIPTIONS'} = 1000;
45 delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'};
47 #$v is just to turn on and off the messages
50 my $str = Bio::SeqIO->new(-file=>'amino.fa' , -format => 'fasta' );
52 #optional: send BLAST request to a cloud service provider instead of NCBI
53 #$factory->set_url_base("http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi");
55 while (my $input = $str->next_seq()){
56 #Blast a sequence against a database:
58 #Alternatively, you could pass in a file with many
59 #sequences rather than loop through sequence one at a time
60 #Remove the loop starting 'while (my $input = $str->next_seq())'
61 #and swap the two lines below for an example of that.
62 my $r = $factory->submit_blast($input);
63 #my $r = $factory->submit_blast('amino.fa');
65 print STDERR "waiting..." if( $v > 0 );
66 while ( my @rids = $factory->each_rid ) {
67 foreach my $rid ( @rids ) {
68 my $rc = $factory->retrieve_blast($rid);
71 $factory->remove_rid($rid);
73 print STDERR "." if ( $v > 0 );
76 my $result = $rc->next_result();
78 my $filename = $result->query_name()."\.out";
79 $factory->save_output($filename);
80 $factory->remove_rid($rid);
81 print "\nQuery Name: ", $result->query_name(), "\n";
82 while ( my $hit = $result->next_hit ) {
83 next unless ( $v > 0);
84 print "\thit name is ", $hit->name, "\n";
85 while( my $hsp = $hit->next_hsp ) {
86 print "\t\tscore is ", $hsp->score, "\n";
94 # This example shows how to change a CGI parameter:
95 $Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'} = 'BLOSUM45';
96 $Bio::Tools::Run::RemoteBlast::HEADER{'GAPCOSTS'} = '15 2';
98 # And this is how to delete a CGI parameter:
99 delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'};
104 Class for remote execution of the NCBI Blast via HTTP.
106 For a description of the many CGI parameters see:
107 https://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html
109 Various additional options and input formats are available.
115 User feedback is an integral part of the evolution of this and other
116 Bioperl modules. Send your comments and suggestions preferably to one
117 of the Bioperl mailing lists. Your participation is much appreciated.
119 bioperl-l@bioperl.org - General discussion
120 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
124 Please direct usage questions or support issues to the mailing list:
126 I<bioperl-l@bioperl.org>
128 rather than to the module maintainer directly. Many experienced and
129 responsive experts will be able to look at the problem and quickly
130 address it. Please include a thorough description of the problem
131 with code and data examples if at all possible.
133 =head2 Reporting Bugs
135 Report bugs to the Bioperl bug tracking system to help us keep track
136 of the bugs and their resolution. Bug reports can be submitted via the
139 https://github.com/bioperl/bioperl-live/issues
143 Please do NOT contact Jason directly about this module. Please post to
144 the bioperl mailing list (L<FEEDBACK>). If you would like to be the
145 official maintainer of this module, please volunteer on the list and
146 we will make it official in this POD.
148 First written by Jason Stajich, many others have helped keep it running.
152 The rest of the documentation details each of the object
153 methods. Internal methods are usually preceded with a _
157 package Bio
::Tools
::Run
::RemoteBlast
;
166 use HTTP
::Request
::Common
;
167 use Bio
::Root
::Version
;
177 # Bio::Root::IO is-a Bio::Root::Root
178 use base
qw(Bio::Root::IO Exporter);
180 our @EXPORT = qw( NOT_FINISHED ERR_QBSTATUS ERR_NOCONTENT ERR_HTTPFAIL ERR_QBNONSPEC );
181 our $MODVERSION = $Bio::Root
::Version
::VERSION
;
182 our $URLBASE = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi';
184 # In GET/PUTPARAMS the values are regexes which validate the input.
186 'AUTO_FORMAT' => '(Off|(Semi|Full)auto)', # Off, Semiauto, Fullauto
187 'COMPOSITION_BASED_STATISTICS' => '(0|1)', # yes, no on NCBI's site, but actually binary 0/1
189 'DB_GENETIC_CODE' => '([1-9]|1[1-6]|2(1|2))', # 1..16,21,22
190 'DISPLAY_SORT' => '\d',
191 'ENDPOINTS' => '(yes|no)', # yes,no
192 'ENTREZ_QUERY' => '.*',
193 'EXPECT' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double
194 'FILTER' => '[LRm]', # L or R or m
195 'GAPCOSTS' => '-?\d+(\.\d+)\s+-?\d+(\.\d+)',
196 # Two space separated float values
197 'GENETIC_CODE' => '([1-9]|1[1-6]|2(1|2))', # 1..16,21,22
198 'HITLIST_SIZE' => '\d+', # Positive integer
199 'I_THRESH' => '-?\d+(\.\d+)([eE]-\d+)?', # float
200 'LAYOUT' => '(One|Two)Windows?', # onewindow, twowindows
201 'LCASE_MASK' => '(yes|no)', # yes, no
202 'MATRIX_NAME' => '.*',
203 'NUCL_PENALTY' => '-\d+', # Negative integer
204 'NUCL_REWARD' => '-?\d+', # Integer
205 'OTHER_ADVANCED' => '.*',
206 'PERC_IDENT' => '\d\d+', # Integer, 0-99 inclusive
207 'PHI_PATTERN' => '.*',
208 'PROGRAM' => 't?blast[pnx]',
209 # tblastp, tblastn, tblastx, blastp, blastn, blastx
211 'QUERY_FILE' => '.*',
212 'QUERY_BELIEVE_DEFLINE' => '(yes|no)', # yes, no
213 'QUERY_FROM' => '\d+', # Positive integer
214 'QUERY_TO' => '\d+', # Positive integer
215 'SEARCHSP_EFF' => '\d+', # Positive integer
216 'SERVICE' => '(plain|p[sh]i|(rps|mega)blast)',
217 # plain,psi,phi,rpsblast,megablast
218 'SHORT_QUERY_ADJUST' => '(true|false)',
219 'THRESHOLD' => '-?\d+', # Integer
220 'UNGAPPED_ALIGNMENT' => '(yes|no)', # yes, no
221 'WORD_SIZE' => '\d+' # Positive integer
224 'ALIGNMENTS' => '\d+', # Positive integer
226 '(Pairwise|(Flat)?QueryAnchored(NoIdentities)?|Tabular)',
227 # Pairwise, QueryAnchored, QueryAnchoredNoIdentities,
228 # FlatQueryAnchored, FlatQueryAnchoredNoIdentities, Tabular
229 'DATABASE_SORT' => '\d',
230 'DESCRIPTIONS' => '\d+', # Positive integer
231 'ENTREZ_LINKS_NEW_WINDOW' => '(yes|no)', # yes, no
232 'EXPECT_LOW' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double
233 'EXPECT_HIGH' => '\d+(\.\d+)?([eE]-\d+)?', # Positive double
234 'FORMAT_ENTREZ_QUERY' => '',
236 '(Alignment|Neighbors|PSSM|SearchInfo|TaxBlast(Parent|MultiFrame)?)',
237 # Alignment, Neighbors, PSSM, SearchInfo
238 # TaxBlast, TaxblastParent, TaxBlastMultiFrame
239 'FORMAT_TYPE' => '((HT|X)ML|ASN\.1|Text)',
240 # HTML, Text, ASN.1, XML
241 'NCBI_GI' => '(yes|no)', # yes, no
242 'NEW_VIEW' => '(true|false)',
244 'RESULTS_FILE' => '(yes|no)', # yes, no
245 'SERVICE' => '(plain|p[sh]i|(rps|mega)blast)',
246 # plain,psi,phi,rpsblast,megablast
247 'SHOW_OVERVIEW' => '(yes|no)' # yes, no
250 # Default values go in here for PUT
253 'FORMAT_OBJECT' => 'Alignment',
254 'COMPOSITION_BASED_STATISTICS' => 'off',
258 'PROGRAM' => 'blastp',
262 # Default values go in here for GET
263 our %RETRIEVALHEADER = (
265 'ALIGNMENTS' => '50',
266 'ALIGNMENT_VIEW' => 'Pairwise',
267 'DESCRIPTIONS' => '100',
268 'FORMAT_TYPE' => 'Text',
272 my ($caller, @args) = @_;
274 my $self = $caller->SUPER::new
(@args);
275 # so that tempfiles are cleaned up
276 $self->_initialize_io();
277 my ($prog, $data, $readmethod, $url_base) =
278 $self->_rearrange([qw(PROG DATA READMETHOD URL_BASE)],
280 # Use these two parameters for backward-compatibility.
281 # Overridden by PROGRAM and DATABASE if supplied.
282 $self->submit_parameter('PROGRAM',$prog) if $prog;
283 $self->submit_parameter('DATABASE',$data) if $data;
285 $readmethod = 'SearchIO' unless defined $readmethod;
286 $self->readmethod($readmethod);
288 # Now read the rest of the parameters and set them all
290 # PUT parameters first
291 my @putValues = $self->_rearrange([keys %PUTPARAMS],@args);
293 @putNames{keys %PUTPARAMS} = @putValues;
294 foreach my $putName (keys %putNames) {
295 $self->submit_parameter($putName,$putNames{$putName});
297 # GET parameters second
298 my @getValues = $self->_rearrange([keys %GETPARAMS],@args);
300 @getNames{keys %GETPARAMS} = @getValues;
301 foreach my $getName (keys %getNames) {
302 $self->retrieve_parameter($getName,$getNames{$getName});
304 # private variable to keep track of total rids
305 $self->{'_total_rids'} = 0;
306 $url_base ||= $URLBASE; # default to regular NCBI BLAST URL
307 $self->set_url_base($url_base);
311 =head2 retrieve_parameter
313 Title : retrieve_parameter
314 Usage : my $db = $self->retrieve_parameter
315 Function: Get/Set the named parameter for the retrieve_blast operation.
317 Args : $name : name of GET parameter
318 $val : optional value to set the parameter to
322 sub retrieve_parameter
{
323 my ($self, $name, $val) = @_;
325 $self->throw($name." is not a valid GET parameter.") unless
326 exists $GETPARAMS{$name};
328 my $regex = $GETPARAMS{$name};
329 $val =~ m/^$regex$/i or
330 $self->throw("Value ".$val." for GET parameter ".$name." does not match expression ".$regex.". Rejecting.");
331 $RETRIEVALHEADER{$name} = $val;
333 return $RETRIEVALHEADER{$name};
336 =head2 submit_parameter
338 Title : submit_parameter
339 Usage : my $db = $self->submit_parameter
340 Function: Get/Set the named parameter for the submit_blast operation.
342 Args : $name : name of PUT parameter
343 $val : optional value to set the parameter to
347 sub submit_parameter
{
348 my ($self, $name, $val) = @_;
350 $self->throw($name." is not a valid PUT parameter.") unless
351 exists $PUTPARAMS{$name};
353 my $regex = $PUTPARAMS{$name};
354 $val =~ m/^$regex$/i or
355 $self->throw("Value ".$val." for PUT parameter ".$name." does not match expression ".$regex.". Rejecting.");
356 $HEADER{$name} = $val;
358 return $HEADER{$name};
364 Usage : my $header = $self->header
365 Function: Get HTTP header for blast query
379 Usage : my $readmethod = $self->readmethod
380 Function: Get/Set the method to read the blast report
382 Args : string [ blast, blasttable, xml ]
387 my ($self, $val) = @_;
389 if ($val =~ /bplite/i) {
390 $self->throw("Use of Bio::Tools::BPlite is deprecated; use Bio::SearchIO modules instead");
392 $self->{'_readmethod'} = $val;
394 return $self->{'_readmethod'};
401 Usage : my $prog = $self->program
402 Function: Get/Set the program to run. Retained for backwards-compatibility.
404 Args : string [ blastp, blastn, blastx, tblastn, tblastx ]
409 my ($self, $val) = @_;
410 return $self->submit_parameter('PROGRAM',$val);
417 Usage : my $db = $self->database
418 Function: Get/Set the database to search. Retained for backwards-compatibility.
420 Args : string [ swissprot, nr, nt, etc... ]
425 my ($self, $val) = @_;
426 return $self->submit_parameter('DATABASE',$val);
433 Usage : my $expect = $self->expect
434 Function: Get/Set the E value cutoff. Retained for backwards-compatibility.
436 Args : string [ '1e-4' ]
441 my ($self, $val) = @_;
442 return $self->submit_parameter('EXPECT',$val);
448 Usage : my $ua = $self->ua or
450 Function: Get/Set a LWP::UserAgent for use
451 Returns : reference to LWP::UserAgent Object
453 Comments: Will create a UserAgent if none has been requested before.
458 my ($self, $value) = @_;
459 if( ! defined $self->{'_ua'} ) {
460 $self->{'_ua'} = LWP
::UserAgent
->new(env_proxy
=> 1, parse_head
=> 0);
463 $self->{'_ua'}->agent("bioperl-$nm/$MODVERSION");
465 return $self->{'_ua'};
471 Usage : $httpproxy = $db->proxy('http') or
472 $db->proxy(['http','ftp'], 'http://myproxy' )
473 Function: Get/Set a proxy for use of proxy
474 Returns : a string indicating the proxy
475 Args : $protocol : an array ref of the protocol(s) to set/get
476 $proxyurl : url of the proxy to use for the specified protocol
481 my ($self,$protocol,$proxy) = @_;
482 return if ( !defined $self->ua || !defined $protocol
483 || !defined $proxy );
484 return $self->ua->proxy($protocol,$proxy);
488 my ($self, @vals) = @_;
490 $self->{'_rids'}->{$_} = $self->{'_total_rids'};
491 $self->{'_total_rids'}++;
493 return scalar keys %{$self->{'_rids'}};
497 my ($self, @vals) = @_;
499 delete $self->{'_rids'}->{$_};
501 return scalar keys %{$self->{'_rids'}};
506 # sort on key value, a little tricky...
507 my @sort_rids = sort {$self->{'_rids'}->{$a} <=> $self->{'_rids'}->{$b}} keys %{$self->{'_rids'}};
514 Usage : $self->submit_blast([$seq1,$seq2]);
515 Function: Submit blast jobs to ncbi blast queue on sequence(s)
516 Returns : Blast report object as defined by $self->readmethod
519 * array ref of sequence objects
520 * filename of file containing fasta formatted sequences
525 my ($self, $input) = @_;
526 my @seqs = $self->_load_input($input);
527 my $url_base = $self->get_url_base;
528 return 0 unless ( @seqs );
530 my %header = $self->header;
531 $header{$_} ||= $RETRIEVALHEADER{$_} foreach (keys %RETRIEVALHEADER);
532 foreach my $seq ( @seqs ) {
533 #If query has a fasta header, the output has the query line.
534 $header{'QUERY'} = ">".(defined $seq->display_id() ?
$seq->display_id() : "").
535 " ".(defined $seq->desc() ?
$seq->desc() : "")."\n".$seq->seq();
536 my $request = POST
$url_base, [%header];
537 $self->debug($request->as_string) if ( $self->verbose > 1);
538 my $response = $self->ua->request( $request);
540 if( $response->is_success ) {
541 my @subdata = split(/\n/, $response->content );
543 foreach ( @subdata ) {
544 if( /^\s+RID\s+=\s+(\S+)/ ) {
546 #$self->debug("RID: $1\n");
548 } elsif (/^\s+RTOE\s+=\s+(.*$)/) {
555 $self->warn("req was ". $request->as_string() . "\n");
556 $self->warn(join('', @subdata));
560 # should try and be a little more verbose here
561 $self->warn("req was ". $request->as_string() . "\n" .
562 $response->error_as_HTML);
569 =head2 retrieve_blast
571 Title : retrieve_blast
572 Usage : my $blastreport = $blastfactory->retrieve_blast($rid);
573 Function: Attempts to retrieve a blast report from remote blast queue
574 Returns : scalar int (constant) or Bio::SearchIO object
576 NOT_FINISHED (= 0) : 'job not finished'
578 ERR_QBSTATUS (= 1) : return line matches 'Status=ERROR'
579 ERR_NOCONTENT (= 2): HTTP request successful, but no content
581 ERR_HTTPFAIL (= 4) : HTTP request failed
582 ERR_QBNONSPEC (= 8): return line matches 'ERROR' (not status line)
583 Args : Remote Blast ID (RID)
588 my($self, $rid) = @_;
589 my $url_base = $self->get_url_base;
590 my %hdr = %RETRIEVALHEADER;
593 my $req = HTTP
::Request
->new(
594 GET
=> $url_base."?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid",
596 #$self->debug("SearchInfo request is " . $req->as_string());
597 my $response = $self->ua->request($req);
598 if( $response->is_success ) {
600 if($response->content =~ /Status=(WAITING|ERROR|FAILED|UNKNOWN|READY)/i ) {
602 if( $status eq 'ERROR' ) {
603 $self->warn("Server Error");
605 } elsif( $status eq 'FAILED' ) {
606 $self->warn("Request Failed");
610 $self->warn("Error: No status reported\n");
612 if ( $status ne 'READY' ) {
615 my ($fh,$tempfile) = $self->tempfile();
618 my $req = POST
$url_base, [%hdr];
619 $self->debug("retrieve request is " . $req->as_string());
620 my $response = $self->ua->request($req, $tempfile);
623 my $mthd = $self->readmethod;
624 $mthd = ($mthd =~ /blasttable/i) ?
'blasttable' :
625 ($mthd =~ /xml/i) ?
'blastxml' :
626 ($mthd =~ /pull/i) ?
'blast_pull' :
628 $blastobj = Bio
::SearchIO
->new(
631 ## store filename in object ##
632 $self->file($tempfile);
636 $self->warn($response->error_as_HTML);
644 Usage : my $saveoutput = $self->save_output($filename)
645 Function: Method to save the blast report
646 Returns : 1 (throws error otherwise)
647 Args : string [filename]
652 my ($self, $filename) = @_;
653 if( not defined $filename ) {
654 $self->throw("Can't save blast output. You must specify a filename to save to.");
656 my $blastfile = $self->file;
657 #open temp file and output file, have to filter out some HTML
658 open my $TMP, '<', $blastfile or $self->throw("Could not read file '$blastfile': $!");
660 open my $SAVEOUT, '>', $filename or $self->throw("Could not write file '$filename': $!");
662 while (my $line = <$TMP>) {
663 next if ($line =~ /<pre>/);
665 if ( $line =~ /^(?:[T]?BLAST[NPX])\s*.+$/i
666 or $line =~ /^RPS-BLAST\s*.+$/i
667 or $line =~ /<\?xml\sversion=/
668 or $line =~ /^#\s+(?:[T]?BLAST[NPX])\s*.+$/
674 print $SAVEOUT $line;
681 my ($self, $input) = @_;
683 if( ! defined $input ) {
684 $self->throw("Calling remote blast with no input");
689 my $seqio = Bio
::SeqIO
->new(-format
=> 'fasta',
691 while( my $seq = $seqio->next_seq ) {
695 $self->throw("Input $input was not a valid filename");
697 } elsif( ref($input) =~ /ARRAY/i ) {
698 foreach ( @
$input ) {
699 if( ref($_) && $_->isa('Bio::PrimarySeqI') ) {
702 $self->warn("Trying to add a " . ref($_) .
703 " but expected a Bio::PrimarySeqI");
707 $self->throw("Did not pass in valid input -- no sequence objects found");
709 } elsif( $input->isa('Bio::PrimarySeqI') ) {
718 Usage : $self->set_url_base($url)
719 Function: Method to override the default URL to access the NCBI BLAST web service
721 Args : string (URL used for remote BLAST searches)
722 NOTE : This is highly experimental; we cannot maintain support on
723 web services other than the default NCBI BLAST web service at this
724 time. Only some URL parameters may be supported by other BLAST
731 $self->{'_urlbase'} = shift if @_;
737 Usage : my $url = $self->get_url_base
738 Function: Get the current URL for BLAST searching
739 Returns : string (URL used for remote blast searches)
746 return $self->{'_urlbase'};
752 Usage : my $url = $self->rtoe
753 Function: Retrieve the retrieval time (defined after submit_blast())
761 return $self->{rtoe
};