a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / CXGN / Tools / Entrez.pm
blob53aaecd20cc79f846f7c428c9c28b3cc194ae564
1 =head1 NAME
3 CXGN::Tools::Entrez
5 =head1 SYNOPSIS
7 A module using eFetch to access information about NCBI identifiers
9 =head1 USAGE
11 #Terminal Script, useful for testing:
12 use CXGN::Tools::Entrez;
13 my $eutil = CXGN::Tools::Entrez->new();
14 $eutil->run_terminal();
17 # Fetch stuff within a script!
18 use CXGN::Tools::Entrez;
19 #....script....
20 my $eutil = CXGN::Tools::Entrez->new({
21 query => "zanzibar",
22 db => "pubmed",
23 format => "abstract",
24 fetch_size => 1 });
26 $eutil->init();
27 my $first_abstract = $eutil->next();
31 my $eutil = CXGN::Tools::Entrez->new();
32 my $seq = $eutil->get_sequence("NP_188752", "Protein");
35 =head1 Author
37 C. Carpita <ccarpita@gmail.com>
39 =cut
41 package CXGN::Tools::Entrez;
43 use Class::MethodMaker
45 scalar => [qw/ db query format ret_mode
46 esearch result
47 fetch_start fetch_size fetch_max count
48 query_key web_env
49 url_base
50 xml_root/]
53 use LWP::Simple;
55 use constant DEBUG => $ENV{ENTREZ_DEBUG};
57 BEGIN {
58 print STDERR "\nDEBUG MODE\n" if DEBUG;
61 #Class Methods
62 sub new {
63 my $class = shift;
64 my $self = bless {}, $class;
65 my $args = shift;
66 if($args && (ref($args) ne "HASH")){
67 die "\nArguments must be sent as a hash reference: ...->new( { db => 'Pubmed', ... } )\n";
69 $self->url_base("http://www.ncbi.nlm.nih.gov/entrez/eutils");
70 $self->fetch_start(0);
71 $self->fetch_size(10);
72 $self->fetch_max(100);
73 $self->ret_mode("text");
75 #Default (usually JSON string)
76 #This should be set in subclass or by user
77 $self->format("");
79 while(my($k, $v) = each %$args){
80 unless (__PACKAGE__->can($k)){
81 die "\nSetting '$k' not recognized\n";
83 $self->$k($v);
85 $self->{queried} = 0;
86 return $self;
89 #Instance Methods
90 sub init {
91 my $self = shift;
93 $self->esearch($self->url_base() . "/esearch.fcgi?" .
94 "db=" . $self->db() .
95 "&retmax=" . $self->fetch_max() .
96 "&usehistory=y" .
97 "&term=" . $self->query());
99 print "\nEsearch: " . $self->esearch() if DEBUG;
100 my $result = LWP::Simple::get($self->esearch);
102 my ($count, $query_key, $web_env) = $result =~
103 /<Count>(\d+)<\/Count>.*<QueryKey>(\d+)<\/QueryKey>.*<WebEnv>(\S+)<\/WebEnv>/s;
105 print STDERR "\nQuery result size: $count" if DEBUG;
106 print STDERR "\n$result\n\n" if DEBUG;
107 $self->count($count);
108 $self->query_key($query_key);
109 $self->web_env($web_env);
110 $self->{queried} = 1;
113 sub next {
114 my $self = shift;
115 my $fetch_size = shift;
116 $fetch_size ||= $self->fetch_size();
118 my $fetch_start = shift;
119 my $no_increment = 0;
120 (defined $fetch_start)?($no_increment = 1):($fetch_start = $self->fetch_start());
122 if($fetch_start >= $self->count()){
123 print STDERR "\nProvided fetch start exceeds result size";
124 return;
127 my $efetch = $self->url_base() . "/efetch.fcgi?" .
128 "rettype=" . $self->format() .
129 "&retmode=text" .
130 "&retstart=" . $fetch_start .
131 "&retmax=" . $fetch_size .
132 "&db=" . $self->db() .
133 "&query_key=" . $self->query_key() .
134 "&WebEnv=" . $self->web_env();
136 my $result = LWP::Simple::get($efetch);
137 print STDERR "\nNo result from fetch" unless($result);
139 #Increment internal counter unless starting point was specified
140 $self->fetch_start($self->fetch_start() + $fetch_size) unless $no_increment;
142 return $result;
145 sub fetch {
146 my $self = shift;
147 my $query = shift;
148 return unless $query;
149 $self->query($query);
150 $self->init();
152 my $fetch_size = shift;
153 $fetch_size ||= $self->fetch_max();
155 return $self->next($fetch_size);
158 sub get_sequence {
159 my $self = shift;
160 my $id = shift;
161 my $db = shift;
162 $db ||= $self->db();
163 die "\nDatabase (2nd arg) not specified" unless $db;
164 if($db && !($db =~ /(protein)|(nucleotide)/i)){
165 die "\nSecond argument (database) must be 'protein' or 'nucleotide'";
167 $self->db($db);
168 $self->format("fasta");
169 my $result = $self->fetch($id, 1);
170 $result =~ s/>.*?\n//s;
171 return $result;
174 sub run_terminal {
175 my $self = shift;
176 $self->ask_for_input();
177 print "\nRunning query...\n";
178 $self->init();
179 $self->terminal_fetch();
182 sub ask_for_input {
183 my $self = shift;
184 $self->db(ask_user("Database", "Protein"));
185 $self->query(ask_user("Query", "Cytochrome P450"));
186 $self->format(ask_user("Format", "Fasta"));
189 sub terminal_fetch {
190 my $self = shift;
191 while(my $result = $self->next()){
192 print $result;
193 my $first = $self->fetch_start() - $self->fetch_size() + 1;
194 my $last = $first + $self->fetch_size() - 1;
195 $last = $self->count() if $last > $self->count();
196 print "\nResults $first - $last out of " . $self->count() . "\n";
197 my $press_msg = "Press <return> to fetch next " . $self->fetch_size() . " results...";
198 print "\n" . ("=" x (length($press_msg))) . "\n";
199 print "$press_msg\n";
200 <STDIN>;
201 print "\nFetching...\n";
205 #Utility Methods
206 sub ask_user {
207 print "$_[0] [$_[1]]: ";
208 my $rc = <>;
209 chomp $rc;
210 if($rc eq "") { $rc = $_[1]; }
211 return $rc;