a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / CXGN / Tools / FeatureFetch.pm
blob00d472da2a8fa93929b0a2e14505dc2ea7868028
1 package CXGN::Tools::FeatureFetch;
2 use strict;
4 use XML::Twig;
6 use CXGN::Chado::Feature;
8 use CXGN::Tools::Entrez;
9 use CXGN::Debug;
12 =head1 CXGN::Tools::FeatureFetch
14 get data from the NucleotideCore site and parse the necessary fields to fill a feature object
17 =head2
20 =head1 Authors
22 Tim Jacobs
23 Naama Menda (nm249@cornell.edu)
25 =cut
27 =head2 new
29 Usage: my $feature_fetch = CXGN::Tools::FeatureFetch->new($feature_obj);
30 Desc:
31 Ret:
32 Args: $feature_object
33 Side Effects:
34 Example:
36 =cut
38 our $feature_object=undef;
39 our %tree=(); # (name =>$tree_name , organism=>$tree_organism, taxon_id=>$taxon_id) ;
41 sub new {
42 my $class = shift;
43 $feature_object= shift;
45 my $args = {};
46 my $self = bless $args, $class;
48 $self->set_feature_object($feature_object);
50 my $GBaccession= $feature_object->get_name();
51 if ($GBaccession) {
52 $self->fetch($GBaccession);
54 return $self;
57 =head2 fetch
59 Usage: CXGN::Tools::featureFetch->fetch($genBank_accession);
60 Desc:
61 Ret:
62 Args: $genBank_accession
63 Side Effects:
64 Example:
66 =cut
68 sub fetch {
69 my $self=shift;
70 my $GBaccession=shift; #GenBank accessions are stored in feature.name!
72 #Entrez display format GBSeqXML:
73 #http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=5&rettype=gb&retmode=xml
74 #http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=8&rettype=gp&retmode=xml
77 my $feature_xml = `wget "eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=$GBaccession&rettype=xml&retmode=text" -O - `;
79 eval{
80 my $twig=XML::Twig->new(
81 twig_roots =>
83 'Textseq-id/Textseq-id_accession' => \&name,
84 'Textseq-id/Textseq-id_version' => \&version,
85 'Seq-data_iupacna/IUPACna' => \&residues, #na sequences
86 'Seq-data_iupacaa/IUPACaa' => \&residues, #aa sequence
87 'Seq-inst/Seq-inst_length' => \&seqlen,
88 'Seqdesc/Seqdesc_title' => \&description,
89 'Org-ref/Org-ref_taxname' => \&organism_name,
90 'Org-ref_db/Dbtag/Dbtag_tag/Object-id/Object-id_id' => \&organism_taxon_id,
91 'PubMedId' => \&pubmed_id,
92 'Bioseq_id/Seq-id/Seq-id_gi' => \&accession, # accession refers to genBnk GI number
93 'MolInfo/MolInfo_biomol' => \&molecule_type,
95 twig_handlers =>
97 # AbstractText => \&abstract,
100 pretty_print => 'indented', # output will be nicely formatted
103 $twig->parse($feature_xml );
106 my $feature= $self->get_feature_object();
107 my $db_name= 'DB:GenBank_GI';
108 $feature->set_db_name($db_name);
109 $feature->set_uniquename($feature->get_name() . "." . $feature->get_version() );
112 $feature->d("name= " . $feature->get_name() . "!");
114 $feature->d("organism name= " . $feature->get_organism_name() . "!");
115 $feature->d("taxon_id= " . $feature->get_organism_taxon_id() . "!");
116 $feature->d("description= " . $feature->get_description() . "!");
117 $feature->d("version= " . $feature->get_version() . "!");
118 $feature->d("uniquname= " . $feature->get_uniquename() . "!");
119 $feature->d("molecule type= " . $feature->get_molecule_type() . "!");
120 $feature->d("description= " . $feature->get_description() . "!");
121 $feature->d("seqlen= " . $feature->get_seqlen() . "!");
122 $feature->d("residues= " . $feature->get_residues() . "!");
125 if($@) {
126 my $message= "!!NCBI server seems to be down. Please try again later!!.\n $@";
127 print STDERR $message;
128 $self->get_feature_object()->set_message($message);
129 }else { print STDERR "exiting FeatureFetch.pm!\n" ; }
132 sub get_feature_object {
133 my $self=shift;
134 return $self->{feature_object};
138 sub set_feature_object {
139 my $self=shift;
140 $self->{feature_object}=shift;
143 =head2 get_organism_name
145 Usage:
146 Desc: Retrieve the scientific organism name
147 Ret:
148 Args:
149 Side Effects:
150 Example:
152 =cut
154 sub get_organism_name {
155 my $self=shift;
156 return $self->{organism_name};
159 =head2 organism_name
161 Usage:
162 Desc: Store the scientific organism name in the feature object. This is used only for cleaner error messages.
163 Ret:
164 Args:
165 Side Effects:
166 Example:
168 =cut
170 sub organism_name {
171 my ($twig, $elt) = @_;
173 $tree{organism} = $elt->text();
174 print STDERR "\n**FeatureFetch found organism name : '". $elt->text() . "' \n";
175 #$twig->purge();
178 =head2 get_organism_taxon_id
180 Usage:
181 Desc: Retrieve the organisms genbank-given taxon_id
182 Ret:
183 Args:
184 Side Effects:
185 Example:
187 =cut
189 sub get_organism_taxon_id {
190 my $self=shift;
191 return $self->{organism_taxon_id};
194 =head2 organism_taxon_id
196 Usage:
197 Desc: Store the genbank taxon_id in the feature object
198 Ret:
199 Args:
200 Side Effects:
201 Example:
203 =cut
205 sub organism_taxon_id {
206 my ($twig, $elt) = @_;
207 print STDERR "calling organism_taxon_id!!\n";
208 $tree{taxon_id} = $elt->text();
209 if (lc($tree{name}) eq lc($feature_object->get_name()) ) {
210 $feature_object->set_organism_taxon_id($elt->text);
211 print STDERR "**the organism taxon id is : '". $feature_object->get_organism_taxon_id() . "' \n";
215 #$twig->purge();
218 sub get_name {
219 my $self=shift;
220 return $self->{name};
223 sub name {
224 my ($twig, $elt)= @_;
225 print STDERR "Calling name!! organism = " . $tree{organism} . ". taxon_id= " . $tree{taxon_id} . "\n";
226 my $name_data= $elt->text;
228 my $fname= $feature_object->get_name();
229 print STDERR "fname = $fname , name_data = $name_data******#####\n";
230 if (lc($fname) eq lc($name_data)) {
231 print STDERR "calling name with $name_data!!!!!!!!!!!!!!\n";
232 $tree{name} = $name_data;
233 $feature_object->set_organism_name($tree{organism}) ;
234 $feature_object->set_organism_taxon_id($tree{taxon_id});
236 $twig->purge;
238 } else { $tree{name}= undef; }
240 #$twig->purge;
243 sub get_accession {
244 my $self=shift;
245 return $self->{accession};
248 sub accession {
249 my ($twig, $elt)= @_;
250 #print STDERR "Calling accession!! Found: " . $elt->text() . " \n";
251 my $accession_data= $elt->text;
253 if (lc($tree{name}) eq lc($feature_object->get_name()) ) {
254 $feature_object->set_accession($elt->text);
256 #$twig->purge;
262 sub pubmed_id {
263 my ($twig, $elt)= @_;
264 my $pubmed_id = $elt->text;
265 print STDERR "Calling pubmed_id";
266 my @pubmed_ids = $feature_object->get_pubmed_ids();
268 my @already_exists = grep(/$pubmed_id/, @pubmed_ids);
270 if(!@already_exists){
271 $feature_object->add_pubmed_id($pubmed_id);
272 print STDERR "***Adding pubmed_id to array: $pubmed_id \n";
274 $twig->purge;
277 sub get_version {
278 my $self=shift;
279 return $self->{version};
282 sub version {
283 my ($twig, $elt)= @_;
284 print STDERR "Calling version!!\n";
285 if (lc($tree{name}) eq lc($feature_object->get_name()) ) {
286 $feature_object->set_version($elt->text);
289 $twig->purge;
292 sub get_residues {
293 my $self=shift;
294 print STDERR "Calling residues!!\n";
295 return $self->{residues};
298 sub residues {
299 my ($twig, $elt)= @_;
300 print STDERR "CAlling residues!! \n";
301 if (lc($tree{name}) eq lc($feature_object->get_name()) ) {
302 $feature_object->set_residues($elt->text);
304 $twig->purge;
307 sub get_seqlen {
308 my $self=shift;
310 return $self->{seqlen}
313 sub seqlen {
314 my ($twig, $elt)= @_;
315 print STDERR "calling seqlen!!\n";
316 if (lc($tree{name}) eq lc($feature_object->get_name()) ) {
317 $feature_object->set_seqlen($elt->text() ) ;
319 $twig->purge;
323 sub description {
324 my ($twig, $elt)= @_;
325 print STDERR "calling description!!\n";
327 if (lc($tree{name}) eq lc($feature_object->get_name()) ) {
328 $feature_object->set_description($elt->text);
331 $twig->purge;
334 sub get_description {
335 my $self = shift;
336 return $self->{description};
339 sub get_molecule_type {
340 my $self=shift;
341 print STDERR "calling molecule_type!!\n";
343 return $self->{molecule_type}
346 sub molecule_type {
347 my ($twig, $elt)= @_;
349 my %mol_hash = (
350 'mRNA' => 'mRNA',
351 '3' => 'mRNA',
352 'rRNA' => 'rRNA',
353 '4' => 'rRNA',
354 'scRNA' => 'scRNA',
355 '7' => 'scRNA',
356 'genomic DNA' => 'genomic_clone',
357 'genomic clone' => 'genomic_clone',
358 '1' => 'genomic_clone',
359 'genomic RNA' => 'RNA',
360 'Pre-RNA' => 'PRE-RNA',
361 'unassigned DNA' => 'DNA',
362 'unassigned RNA' => 'RNA',
363 'ss-RNA' => 'RNA',
364 'RNA' => 'RNA',
365 'DNA' =>'DNA',
366 'snRNA' =>'snRNA',
367 '8' => 'protein'
369 my $mol_text= ($elt->text);
370 my $molecule = $mol_hash{$mol_text};
371 if (!$molecule) {warn "no molecule type found for type $mol_text!!\n";}
372 else { print STDERR "molecule type = $molecule\n"; }
374 if (lc($tree{name}) eq lc($feature_object->get_name()) ) {
375 $feature_object->set_molecule_type($molecule);
377 $twig->purge;
380 #### DO NOT REMOVE
381 return 1;
382 ####