maint: remove no longer used test data files.
[bioperl-live.git] / lib / Bio / DB / NCBIHelper.pm
blobf87fe7d2b00cfcfa30294d1cd458c5d34b24352a
2 # BioPerl module for Bio::DB::NCBIHelper
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 # Interfaces with new WebDBSeqI interface
16 =head1 NAME
18 Bio::DB::NCBIHelper - A collection of routines useful for queries to
19 NCBI databases.
21 =head1 SYNOPSIS
23 # Do not use this module directly.
25 # get a Bio::DB::NCBIHelper object somehow
26 my $seqio = $db->get_Stream_by_acc(['J00522']);
27 foreach my $seq ( $seqio->next_seq ) {
28 # process seq
31 =head1 DESCRIPTION
33 Provides a single place to setup some common methods for querying NCBI
34 web databases. This module just centralizes the methods for
35 constructing a URL for querying NCBI GenBank and NCBI GenPept and the
36 common HTML stripping done in L<postprocess_data>().
38 The base NCBI query URL used is:
39 https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi
41 =head1 FEEDBACK
43 =head2 Mailing Lists
45 User feedback is an integral part of the
46 evolution of this and other Bioperl modules. Send
47 your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation
49 is much appreciated.
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
54 =head2 Support
56 Please direct usage questions or support issues to the mailing list:
58 I<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 reponsive experts will be able look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
65 =head2 Reporting Bugs
67 Report bugs to the Bioperl bug tracking system to
68 help us keep track the bugs and their resolution.
69 Bug reports can be submitted via the web.
71 https://github.com/bioperl/bioperl-live/issues
73 =head1 AUTHOR - Jason Stajich
75 Email jason@bioperl.org
77 =head1 APPENDIX
79 The rest of the documentation details each of the
80 object methods. Internal methods are usually
81 preceded with a _
83 =cut
85 # Let the code begin...
87 package Bio::DB::NCBIHelper;
88 use strict;
90 use Bio::DB::Query::GenBank;
91 use HTTP::Request::Common;
92 use URI;
93 use Bio::Root::IO;
94 use Bio::DB::RefSeq;
95 use URI::Escape qw(uri_unescape);
97 use base qw(Bio::DB::WebDBSeqI Bio::Root::Root);
99 our $HOSTBASE = 'https://eutils.ncbi.nlm.nih.gov';
100 our $MAX_ENTRIES = 19000;
101 our $REQUEST_DELAY = 3;
102 our %CGILOCATION = (
103 'batch' => [ 'post' => '/entrez/eutils/epost.fcgi' ],
104 'query' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
105 'single' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
106 'version' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
107 'gi' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
108 'webenv' => [ 'get' => '/entrez/eutils/efetch.fcgi' ]
110 our %FORMATMAP = (
111 'gb' => 'genbank',
112 'gp' => 'genbank',
113 'fasta' => 'fasta',
114 'asn.1' => 'entrezgene',
115 'gbwithparts' => 'genbank',
117 our $DEFAULTFORMAT = 'gb';
119 =head2 new
121 Title : new
122 Usage :
123 Function: the new way to make modules a little more lightweight
124 Returns :
125 Args :
127 =cut
129 sub new {
130 my ( $class, @args ) = @_;
131 my $self = $class->SUPER::new(@args);
132 my ($seq_start, $seq_stop, $no_redirect,
133 $redirect, $complexity, $strand
135 = $self->_rearrange(
136 [ qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND) ],
137 @args
139 $seq_start && $self->seq_start($seq_start);
140 $seq_stop && $self->seq_stop($seq_stop);
141 $no_redirect && $self->no_redirect($no_redirect);
142 $redirect && $self->redirect_refseq($redirect);
143 $strand && $self->strand($strand);
145 # adjust statement to accept zero value
146 defined $complexity
147 && ( $complexity >= 0 && $complexity <= 4 )
148 && $self->complexity($complexity);
149 return $self;
153 =head2 get_params
155 Title : get_params
156 Usage : my %params = $self->get_params($mode)
157 Function: returns key,value pairs to be passed to NCBI database
158 for either 'batch' or 'single' sequence retrieval method
159 Returns : a key,value pair hash
160 Args : 'single' or 'batch' mode for retrieval
162 =cut
164 sub get_params {
165 my ($self, $mode) = @_;
166 $self->throw("subclass did not implement get_params");
169 =head2 default_format
171 Title : default_format
172 Usage : my $format = $self->default_format
173 Function: returns default sequence format for this module
174 Returns : string
175 Args : none
177 =cut
179 sub default_format {
180 return $DEFAULTFORMAT;
183 =head2 get_request
185 Title : get_request
186 Usage : my $url = $self->get_request
187 Function: HTTP::Request
188 Returns :
189 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
191 =cut
193 sub get_request {
194 my ( $self, @qualifiers ) = @_;
195 my ( $mode, $uids, $format, $query, $seq_start, $seq_stop, $strand,
196 $complexity )
197 = $self->_rearrange(
198 [qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
199 @qualifiers );
200 $mode = lc $mode;
201 ($format) = $self->request_format() unless ( defined $format );
202 if ( !defined $mode || $mode eq '' ) { $mode = 'single'; }
203 my %params = $self->get_params($mode);
204 if ( !%params ) {
205 $self->throw(
206 "must specify a valid retrieval mode 'single' or 'batch' not '$mode'"
209 my $url = URI->new( $HOSTBASE . $CGILOCATION{$mode}[1] );
210 unless ( $mode eq 'webenv' || defined $uids || defined $query ) {
211 $self->throw("Must specify a query or list of uids to fetch");
213 if ( $query && $query->can('cookie') ) {
214 @params{ 'WebEnv', 'query_key' } = $query->cookie;
215 $params{'db'} = $query->db;
217 elsif ($query) {
218 $params{'id'} = join ',', $query->ids;
221 # for batch retrieval, non-query style
222 elsif ( $mode eq 'webenv' && $self->can('cookie') ) {
223 @params{ 'WebEnv', 'query_key' } = $self->cookie;
225 elsif ($uids) {
226 if ( ref($uids) =~ /array/i ) {
227 $uids = join( ",", @$uids );
229 $params{'id'} = $uids;
231 $seq_start && ( $params{'seq_start'} = $seq_start );
232 $seq_stop && ( $params{'seq_stop'} = $seq_stop );
233 $strand && ( $params{'strand'} = $strand );
234 if ( defined $complexity && ( $seq_start || $seq_stop || $strand ) ) {
235 $self->warn(
236 "Complexity set to $complexity; seq_start and seq_stop may not work!"
237 ) if ( $complexity != 1 && ( $seq_start || $seq_stop ) );
238 $self->warn(
239 "Complexity set to 0; expect strange results with strand set to 2"
240 ) if ( $complexity == 0 && $strand == 2 && $format eq 'fasta' );
242 defined $complexity && ( $params{'complexity'} = $complexity );
243 $params{'rettype'} = $format unless $mode eq 'batch';
245 # for now, 'post' is batch retrieval
246 if ( $CGILOCATION{$mode}[0] eq 'post' ) {
247 my $response = $self->ua->request( POST $url, [%params] );
248 $response->proxy_authorization_basic( $self->authentication )
249 if ( $self->authentication );
250 $self->_parse_response( $response->content );
251 my ( $cookie, $querykey ) = $self->cookie;
252 my %qualifiers = (
253 '-mode' => 'webenv',
254 '-seq_start' => $seq_start,
255 '-seq_stop' => $seq_stop,
256 '-strand' => $strand,
257 '-complexity' => $complexity,
258 '-format' => $format
260 return $self->get_request(%qualifiers);
262 else {
263 $url->query_form(%params);
264 return GET $url;
269 =head2 get_Stream_by_batch
271 Title : get_Stream_by_batch
272 Usage : $seq = $db->get_Stream_by_batch($ref);
273 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
274 at a time. For large numbers of sequences, this is far superior
275 than get_Stream_by_id or get_Stream_by_acc.
276 Example :
277 Returns : a Bio::SeqIO stream object
278 Args : $ref : either an array reference, a filename, or a filehandle
279 from which to get the list of unique ids/accession numbers.
281 NOTE: deprecated API. Use get_Stream_by_id() instead.
283 =cut
285 *get_Stream_by_batch = sub {
286 my $self = shift;
287 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
288 $self->get_Stream_by_id(@_)
291 =head2 get_Stream_by_query
293 Title : get_Stream_by_query
294 Usage : $seq = $db->get_Stream_by_query($query);
295 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
296 at a time. For large numbers of sequences, this is far superior
297 to get_Stream_by_id and get_Stream_by_acc.
298 Example :
299 Returns : a Bio::SeqIO stream object
300 Args : An Entrez query string or a Bio::DB::Query::GenBank object.
301 It is suggested that you create a Bio::DB::Query::GenBank object and get
302 the entry count before you fetch a potentially large stream.
304 =cut
306 sub get_Stream_by_query {
307 my ($self, $query) = @_;
308 unless (ref $query && $query->can('query')) {
309 $query = Bio::DB::Query::GenBank->new($query);
311 return $self->get_seq_stream('-query' => $query, '-mode'=>'query');
314 =head2 postprocess_data
316 Title : postprocess_data
317 Usage : $self->postprocess_data ( 'type' => 'string',
318 'location' => \$datastr );
319 Function: Process downloaded data before loading into a Bio::SeqIO. This
320 works for Genbank and Genpept, other classes should override
321 it with their own method.
322 Returns : void
323 Args : hash with two keys:
325 'type' can be 'string' or 'file'
326 'location' either file location or string reference containing data
328 =cut
330 sub postprocess_data {
331 # retain this in case postprocessing is needed at a future date
335 =head2 request_format
337 Title : request_format
338 Usage : my ($req_format, $ioformat) = $self->request_format;
339 $self->request_format("genbank");
340 $self->request_format("fasta");
341 Function: Get/Set sequence format retrieval. The get-form will normally not
342 be used outside of this and derived modules.
343 Returns : Array of two strings, the first representing the format for
344 retrieval, and the second specifying the corresponding SeqIO format.
345 Args : $format = sequence format
347 =cut
349 sub request_format {
350 my ( $self, $value ) = @_;
351 if ( defined $value ) {
352 $value = lc $value;
353 if ( defined $FORMATMAP{$value} ) {
354 $self->{'_format'} = [ $value, $FORMATMAP{$value} ];
356 else {
357 # Try to fall back to a default. Alternatively, we could throw
358 # an exception
359 $self->{'_format'} = [ $value, $value ];
362 return @{ $self->{'_format'} };
366 =head2 redirect_refseq
368 Title : redirect_refseq
369 Usage : $db->redirect_refseq(1)
370 Function: simple getter/setter which redirects RefSeqs to use Bio::DB::RefSeq
371 Returns : Boolean value
372 Args : Boolean value (optional)
373 Throws : 'unparseable output exception'
374 Note : This replaces 'no_redirect' as a more straightforward flag to
375 redirect possible RefSeqs to use Bio::DB::RefSeq (EBI interface)
376 instead of retrieving the NCBI records
378 =cut
380 sub redirect_refseq {
381 my $self = shift;
382 return $self->{'_redirect_refseq'} = shift if @_;
383 return $self->{'_redirect_refseq'};
386 =head2 complexity
388 Title : complexity
389 Usage : $db->complexity(3)
390 Function: get/set complexity value
391 Returns : value from 0-4 indicating level of complexity
392 Args : value from 0-4 (optional); if unset server assumes 1
393 Throws : if arg is not an integer or falls outside of noted range above
394 Note : From efetch docs, the complexity regulates the display:
396 0 - get the whole blob
397 1 - get the bioseq for gi of interest (default in Entrez)
398 2 - get the minimal bioseq-set containing the gi of interest
399 3 - get the minimal nuc-prot containing the gi of interest
400 4 - get the minimal pub-set containing the gi of interest
402 =cut
404 sub complexity {
405 my ( $self, $comp ) = @_;
406 if ( defined $comp ) {
407 $self->throw("Complexity value must be integer between 0 and 4")
408 if $comp !~ /^\d+$/ || $comp < 0 || $comp > 4;
409 $self->{'_complexity'} = $comp;
411 return $self->{'_complexity'};
414 =head2 strand
416 Title : strand
417 Usage : $db->strand(1)
418 Function: get/set strand value
419 Returns : strand value if set
420 Args : value of 1 (plus) or 2 (minus); if unset server assumes 1
421 Throws : if arg is not an integer or is not 1 or 2
422 Note : This differs from BioPerl's use of strand: 1 = plus, -1 = minus 0 = not relevant.
423 We should probably add in some functionality to convert over in the future.
425 =cut
427 sub strand {
428 my ($self, $str) = @_;
429 if ($str) {
430 $self->throw("strand() must be integer value of 1 (plus strand) or 2 (minus strand) if set") if
431 $str !~ /^\d+$/ || $str < 1 || $str > 2;
432 $self->{'_strand'} = $str;
434 return $self->{'_strand'};
437 =head2 seq_start
439 Title : seq_start
440 Usage : $db->seq_start(123)
441 Function: get/set sequence start location
442 Returns : sequence start value if set
443 Args : integer; if unset server assumes 1
444 Throws : if arg is not an integer
446 =cut
448 sub seq_start {
449 my ($self, $start) = @_;
450 if ($start) {
451 $self->throw("seq_start() must be integer value if set") if
452 $start !~ /^\d+$/;
453 $self->{'_seq_start'} = $start;
455 return $self->{'_seq_start'};
458 =head2 seq_stop
460 Title : seq_stop
461 Usage : $db->seq_stop(456)
462 Function: get/set sequence stop (end) location
463 Returns : sequence stop (end) value if set
464 Args : integer; if unset server assumes 1
465 Throws : if arg is not an integer
467 =cut
469 sub seq_stop {
470 my ($self, $stop) = @_;
471 if ($stop) {
472 $self->throw("seq_stop() must be integer if set") if
473 $stop !~ /^\d+$/;
474 $self->{'_seq_stop'} = $stop;
476 return $self->{'_seq_stop'};
479 =head2 Bio::DB::WebDBSeqI methods
481 Overriding WebDBSeqI method to help newbies to retrieve sequences
483 =head2 get_Stream_by_acc
485 Title : get_Stream_by_acc
486 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
487 Function: gets a series of Seq objects by accession numbers
488 Returns : a Bio::SeqIO stream object
489 Args : $ref : a reference to an array of accession numbers for
490 the desired sequence entries
491 Note : For GenBank, this just calls the same code for get_Stream_by_id()
493 =cut
495 sub get_Stream_by_acc {
496 my ( $self, $ids ) = @_;
497 my $newdb = $self->_check_id($ids);
498 if ( defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq') ) {
499 return $newdb->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
501 else {
502 return $self->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
506 =head2 _check_id
508 Title : _check_id
509 Usage :
510 Function:
511 Returns : a Bio::DB::RefSeq reference or throws
512 Args : $id(s), $string
514 =cut
516 sub _check_id {
517 my ( $self, $ids ) = @_;
519 # NT contigs can not be retrieved
520 $self->throw("NT_ contigs are whole chromosome files which are not part of regular"
521 . "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
522 if $ids =~ /NT_/;
524 # Asking for a RefSeq from EMBL/GenBank
525 if ( $self->redirect_refseq ) {
526 if ( $ids =~ /N._/ ) {
527 $self->warn(
528 "[$ids] is not a normal sequence database but a RefSeq entry."
529 . " Redirecting the request.\n" )
530 if $self->verbose >= 0;
531 return Bio::DB::RefSeq->new();
537 =head2 delay_policy
539 Title : delay_policy
540 Usage : $secs = $self->delay_policy
541 Function: NCBI requests a delay of 3 seconds between requests. This method
542 implements that policy.
543 Returns : number of seconds to delay
544 Args : none
546 =cut
548 sub delay_policy {
549 my $self = shift;
550 return $REQUEST_DELAY;
553 =head2 cookie
555 Title : cookie
556 Usage : ($cookie,$querynum) = $db->cookie
557 Function: return the NCBI query cookie, this information is used by
558 Bio::DB::GenBank in conjunction with efetch, ripped from
559 Bio::DB::Query::GenBank
560 Returns : list of (cookie,querynum)
561 Args : none
563 =cut
565 sub cookie {
566 my $self = shift;
567 if (@_) {
568 $self->{'_cookie'} = shift;
569 $self->{'_querynum'} = shift;
571 else {
572 return @{$self}{qw(_cookie _querynum)};
576 =head2 _parse_response
578 Title : _parse_response
579 Usage : $db->_parse_response($content)
580 Function: parse out response for cookie, this is a trimmed-down version
581 of _parse_response from Bio::DB::Query::GenBank
582 Returns : empty
583 Args : none
584 Throws : 'unparseable output exception'
586 =cut
588 sub _parse_response {
589 my $self = shift;
590 my $content = shift;
591 if ( my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s ) {
592 $self->warn("Warning(s) from GenBank: $warning\n");
594 if ( my ($error) = $content =~ /<OutputMessage>([^<]+)/ ) {
595 $self->throw("Error from Genbank: $error");
597 my ($cookie) = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
598 my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
599 $self->cookie( uri_unescape($cookie), $querykey );
602 =head2 no_redirect
604 Title : no_redirect
605 Usage : $db->no_redirect($content)
606 Function: DEPRECATED - Used to indicate that Bio::DB::GenBank instance retrieves
607 possible RefSeqs from EBI instead; default behavior is now to
608 retrieve directly from NCBI
609 Returns : None
610 Args : None
611 Throws : Method is deprecated in favor of positive flag method 'redirect_refseq'
613 =cut
615 sub no_redirect {
616 shift->throw(
617 "Use of no_redirect() is deprecated. Bio::DB::GenBank default is to always\n".
618 "retrieve from NCBI. In order to redirect possible RefSeqs to EBI, set\n".
619 "redirect_refseq flag to 1");
624 __END__