2 # BioPerl module for Bio::DB::Query::GenBank.pm
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Lincoln Stein <lstein@cshl.org>
8 # Copyright Lincoln Stein
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
17 Bio::DB::Query::GenBank - Build a GenBank Entrez Query
21 use Bio::DB::Query::GenBank;
24 my $query_string = 'Oryza[Organism] AND EST[Keyword]';
25 my $query = Bio::DB::Query::GenBank->new(-db => 'nucleotide',
26 -query => $query_string,
30 print $query->count,"\n";
32 # get a Genbank database handle
33 my $gb = Bio::DB::GenBank->new();
34 my $stream = $gb->get_Stream_by_query($query);
35 while (my $seq = $stream->next_seq) {
36 # do something with the sequence object
39 # initialize the list yourself
40 my $query = Bio::DB::Query::GenBank->new(-ids=>[195052,2981014,11127914]);
45 This class encapsulates NCBI Entrez queries. It can be used to store
46 a list of GI numbers, to translate an Entrez query expression into a
47 list of GI numbers, or to count the number of terms that would be
48 returned by a query. Once created, the query object can be passed to
49 a Bio::DB::GenBank object in order to retrieve the entries
50 corresponding to the query.
56 User feedback is an integral part of the
57 evolution of this and other Bioperl modules. Send
58 your comments and suggestions preferably to one
59 of the Bioperl mailing lists. Your participation
62 bioperl-l@bioperl.org - General discussion
63 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
67 Please direct usage questions or support issues to the mailing list:
69 I<bioperl-l@bioperl.org>
71 rather than to the module maintainer directly. Many experienced and
72 responsive experts will be able to look at the problem and quickly
73 address it. Please include a thorough description of the problem
74 with code and data examples if at all possible.
78 Report bugs to the Bioperl bug tracking system to help us keep track
79 of the bugs and their resolution. Bug reports can be submitted via the
82 https://github.com/bioperl/bioperl-live/issues
84 =head1 AUTHOR - Lincoln Stein
90 The rest of the documentation details each of the
91 object methods. Internal methods are usually
96 # Let the code begin...
98 package Bio
::DB
::Query
::GenBank
;
100 use URI
::Escape
'uri_unescape';
101 use Bio
::DB
::NCBIHelper
;
104 #use constant EPOST => $Bio::DB::NCBIHelper::HOSTBASE . '/entrez/eutils/epost.fcgi';
105 #use constant ESEARCH => $Bio::DB::NCBIHelper::HOSTBASE . '/entrez/eutils/esearch.fcgi';
106 # referencing the package ('our') variable $Bio::DB::NCBIHelper::HOSTBASE doesn't seem to work in
107 # a constant definition in perl 5.10.1 or 5.16.3
# Paths of the NCBI eUtils CGI endpoints. They are stored without a host
# part; ESEARCH is concatenated with $Bio::DB::NCBIHelper::HOSTBASE when the
# request parameters are built. EPOST is not referenced in the visible code.
108 use constant EPOST
=> '/entrez/eutils/epost.fcgi';
109 use constant ESEARCH
=> '/entrez/eutils/esearch.fcgi';
# Database name used by the constructor when no -db argument is supplied.
110 use constant DEFAULT_DB
=> 'protein';
# Fallback value for the 'retmax' request parameter when maxids is not set.
111 use constant MAXENTRY
=> 100;
113 use vars
qw(@ATTRIBUTES);
115 use base qw(Bio::DB::Query::WebQuery);
118 @ATTRIBUTES = qw(db reldate mindate maxdate datetype maxids);
119 for my $method (@ATTRIBUTES) {
123 my \$d = \$self->{'_$method'};
124 \$self->{'_$method'} = shift if \@_;
134 Usage : $db = Bio::DB::Query::GenBank->new(@args)
135 Function: create new query object
136 Returns : new query object
137 Args : -db database (see below for allowable values)
139 -mindate minimum date to retrieve from (YYYY/MM/DD)
140 -maxdate maximum date to retrieve from (YYYY/MM/DD)
141 -reldate relative date to retrieve from (days)
142 -datetype date field to use ('edat' or 'mdat')
143 -ids array ref of gids (overrides query)
144 -maxids the maximum number of IDs you wish to collect
147 This method creates a new query object. Typically you will specify a
148 -db and a -query argument, possibly modified by -mindate, -maxdate, or
149 -reldate. -mindate and -maxdate specify minimum and maximum dates for
150 entries you are interested in retrieving, expressed in the form
151 YYYY/MM/DD. -reldate is used to fetch entries that are more recent
152 than the indicated number of days.
154 If you provide an array reference of IDs in -ids, the query will be
155 ignored and the list of IDs will be used when the query is passed to a
156 Bio::DB::GenBank object's get_Stream_by_query() method. A variety of
157 IDs are automatically recognized, including GI numbers, Accession
158 numbers, Accession.version numbers and locus names.
160 By default, the query will collect only the first 100 IDs and will
161 generate an exception if you call the ids() method and the query
162 returned more than that number. To increase this maximum, set -maxids
163 to a number larger than the number of IDs you expect to obtain. This
164 only affects the list of IDs you obtain when you call the ids()
165 method, and does not affect in any way the number of entries you
166 receive when you generate a SeqIO stream from the query.
170 The most commonly used databases are:
179 An up to date list of database names supported by NCBI eUtils is
181 https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?
183 However, note that not all of these databases return datatypes that
184 are parsable by Bio::DB::GenBank
190 my $self = $class->SUPER::new
(@_);
191 my ($query,$db,$reldate,$mindate,$maxdate,$datetype,$ids,$maxids)
192 = $self->_rearrange([qw(QUERY DB RELDATE MINDATE MAXDATE DATETYPE IDS MAXIDS)],@_);
193 $self->db($db || DEFAULT_DB
);
194 $reldate && $self->reldate($reldate);
195 $mindate && $self->mindate($mindate);
196 $maxdate && $self->maxdate($maxdate);
197 $maxids && $self->maxids($maxids);
198 $datetype ||= 'mdat';
199 $datetype && $self->datetype($datetype);
206 Usage : ($cookie,$querynum) = $db->cookie
207 Function: return the NCBI query cookie
208 Returns : list of (cookie,querynum)
211 NOTE: this information is used by Bio::DB::GenBank in
212 conjunction with efetch.
219 $self->{'_cookie'} = shift;
220 $self->{'_querynum'} = shift;
225 @
{$self}{qw(_cookie _querynum)};
229 =head2 _request_parameters
231 Title : _request_parameters
232 Usage : ($method,$base,@params) = $db->_request_parameters
233 Function: return information needed to construct the request
234 Returns : list of method, url base and key=>value pairs
239 sub _request_parameters
{
242 my @params = map {eval("\$self->$_") ?
($_ => eval("\$self->$_")) : () } @ATTRIBUTES;
243 push @params,('usehistory'=>'y','tool'=>'bioperl');
246 $base = $Bio::DB
::NCBIHelper
::HOSTBASE
.ESEARCH
; # this seems to need to be dynamic
247 push @params,('term' => $self->query);
248 # Providing 'retmax' limits queries to 500 sequences ?? I don't think so LS
249 push @params,('retmax' => $self->maxids || MAXENTRY
);
251 # And actually, it seems that we need 'retstart' equal to 0 ?? I don't think so LS
252 # push @params, ('retstart' => 0);
254 ($method,$base,@params);
261 Usage : $count = $db->count;
262 Function: return count of number of entries retrieved by query
266 Returns the number of entries that are matched by the query.
273 my $d = $self->{'_count'};
274 $self->{'_count'} = shift;
279 return $self->{'_count'};
286 Usage : @ids = $db->ids([@ids])
287 Function: get/set matching ids
288 Returns : array of sequence ids
289 Args : (optional) array ref with new set of ids
296 Usage : $query = $db->query([$query])
297 Function: get/set query string
299 Args : (optional) new query string
303 =head2 _parse_response
305 Title : _parse_response
306 Usage : $db->_parse_response($content)
307 Function: parse out response
310 Throws : 'unparseable output exception'
314 sub _parse_response
{
317 if (my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s) {
318 $self->warn("Warning(s) from GenBank: $warning\n");
320 if (my ($error) = $content =~ /<OutputMessage>([^<]+)/) {
321 $self->throw("Error from Genbank: $error");
324 my ($count) = $content =~ /<Count>(\d+)/;
325 my ($max) = $content =~ /<RetMax>(\d+)/;
326 my $truncated = $count > $max;
327 $self->count($count);
329 my @ids = $content =~ /<Id>(\d+)/g;
332 $self->debug("ids truncated at $max\n");
334 $self->_truncated($truncated);
335 my ($cookie) = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
336 my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
337 $self->cookie(uri_unescape
($cookie),$querykey);
340 =head2 _generate_id_string
342 Title : _generate_id_string
343 Usage : $string = $db->_generate_id_string
344 Function: joins IDs together in string (possibly implementation-dependent)
345 Returns : string of concatenated IDs
346 Args : array ref of ids (normally passed into the constructor)
350 sub _generate_id_string
{
351 my ($self, $ids) = @_;
352 # this attempts to separate out accs (alphanumeric) from UIDs (numeric only)
353 # recent changes to esearch have wrought this upon us.. cjf 4/19/07
354 return sprintf('%s',join('|',map {
355 ($_ =~ m{^\d+$}) ?
$_.'[UID]' : $_.'[PACC]'