1 package CXGN
::Tools
::FeatureFetch
;
6 use CXGN
::Chado
::Feature
;
8 use CXGN
::Tools
::Entrez
;
12 =head1 CXGN::Tools::FeatureFetch
14 get data from the NucleotideCore site and parse the necessary fields to fill a feature object
23 Naama Menda (nm249@cornell.edu)
29 Usage: my $feature_fetch = CXGN::Tools::FeatureFetch->new($feature_obj);
38 our $feature_object=undef;
39 our %tree=(); # (name =>$tree_name , organism=>$tree_organism, taxon_id=>$taxon_id) ;
43 $feature_object= shift;
46 my $self = bless $args, $class;
48 $self->set_feature_object($feature_object);
50 my $GBaccession= $feature_object->get_name();
52 $self->fetch($GBaccession);
59 Usage: CXGN::Tools::featureFetch->fetch($genBank_accession);
62 Args: $genBank_accession
70 my $GBaccession=shift; #GenBank accessions are stored in feature.name!
72 #Entrez display format GBSeqXML:
73 #http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=5&rettype=gb&retmode=xml
74 #http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=8&rettype=gp&retmode=xml
77 my $feature_xml = `wget "eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=$GBaccession&rettype=xml&retmode=text" -O - `;
80 my $twig=XML
::Twig
->new(
83 'Textseq-id/Textseq-id_accession' => \
&name
,
84 'Textseq-id/Textseq-id_version' => \
&version
,
85 'Seq-data_iupacna/IUPACna' => \
&residues
, #na sequences
86 'Seq-data_iupacaa/IUPACaa' => \
&residues
, #aa sequence
87 'Seq-inst/Seq-inst_length' => \
&seqlen
,
88 'Seqdesc/Seqdesc_title' => \
&description
,
89 'Org-ref/Org-ref_taxname' => \
&organism_name
,
90 'Org-ref_db/Dbtag/Dbtag_tag/Object-id/Object-id_id' => \
&organism_taxon_id
,
91 'PubMedId' => \
&pubmed_id
,
92 'Bioseq_id/Seq-id/Seq-id_gi' => \
&accession
, # accession refers to genBnk GI number
93 'MolInfo/MolInfo_biomol' => \
&molecule_type
,
97 # AbstractText => \&abstract,
100 pretty_print
=> 'indented', # output will be nicely formatted
103 $twig->parse($feature_xml );
106 my $feature= $self->get_feature_object();
107 my $db_name= 'DB:GenBank_GI';
108 $feature->set_db_name($db_name);
109 $feature->set_uniquename($feature->get_name() . "." . $feature->get_version() );
112 $feature->d("name= " . $feature->get_name() . "!");
114 $feature->d("organism name= " . $feature->get_organism_name() . "!");
115 $feature->d("taxon_id= " . $feature->get_organism_taxon_id() . "!");
116 $feature->d("description= " . $feature->get_description() . "!");
117 $feature->d("version= " . $feature->get_version() . "!");
118 $feature->d("uniquname= " . $feature->get_uniquename() . "!");
119 $feature->d("molecule type= " . $feature->get_molecule_type() . "!");
120 $feature->d("description= " . $feature->get_description() . "!");
121 $feature->d("seqlen= " . $feature->get_seqlen() . "!");
122 $feature->d("residues= " . $feature->get_residues() . "!");
126 my $message= "!!NCBI server seems to be down. Please try again later!!.\n $@";
127 print STDERR
$message;
128 $self->get_feature_object()->set_message($message);
129 }else { print STDERR
"exiting FeatureFetch.pm!\n" ; }
132 sub get_feature_object
{
134 return $self->{feature_object
};
138 sub set_feature_object
{
140 $self->{feature_object
}=shift;
143 =head2 get_organism_name
146 Desc: Retrieve the scientific organism name
154 sub get_organism_name
{
156 return $self->{organism_name
};
162 Desc: Store the scientific organism name in the feature object. This is used only for cleaner error messages.
171 my ($twig, $elt) = @_;
173 $tree{organism
} = $elt->text();
174 print STDERR
"\n**FeatureFetch found organism name : '". $elt->text() . "' \n";
178 =head2 get_organism_taxon_id
181 Desc: Retrieve the organisms genbank-given taxon_id
189 sub get_organism_taxon_id
{
191 return $self->{organism_taxon_id
};
194 =head2 organism_taxon_id
197 Desc: Store the genbank taxon_id in the feature object
205 sub organism_taxon_id
{
206 my ($twig, $elt) = @_;
207 print STDERR
"calling organism_taxon_id!!\n";
208 $tree{taxon_id
} = $elt->text();
209 if (lc($tree{name
}) eq lc($feature_object->get_name()) ) {
210 $feature_object->set_organism_taxon_id($elt->text);
211 print STDERR
"**the organism taxon id is : '". $feature_object->get_organism_taxon_id() . "' \n";
220 return $self->{name
};
224 my ($twig, $elt)= @_;
225 print STDERR
"Calling name!! organism = " . $tree{organism
} . ". taxon_id= " . $tree{taxon_id
} . "\n";
226 my $name_data= $elt->text;
228 my $fname= $feature_object->get_name();
229 print STDERR
"fname = $fname , name_data = $name_data******#####\n";
230 if (lc($fname) eq lc($name_data)) {
231 print STDERR
"calling name with $name_data!!!!!!!!!!!!!!\n";
232 $tree{name
} = $name_data;
233 $feature_object->set_organism_name($tree{organism
}) ;
234 $feature_object->set_organism_taxon_id($tree{taxon_id
});
238 } else { $tree{name
}= undef; }
245 return $self->{accession
};
249 my ($twig, $elt)= @_;
250 #print STDERR "Calling accession!! Found: " . $elt->text() . " \n";
251 my $accession_data= $elt->text;
253 if (lc($tree{name
}) eq lc($feature_object->get_name()) ) {
254 $feature_object->set_accession($elt->text);
263 my ($twig, $elt)= @_;
264 my $pubmed_id = $elt->text;
265 print STDERR
"Calling pubmed_id";
266 my @pubmed_ids = $feature_object->get_pubmed_ids();
268 my @already_exists = grep(/$pubmed_id/, @pubmed_ids);
270 if(!@already_exists){
271 $feature_object->add_pubmed_id($pubmed_id);
272 print STDERR
"***Adding pubmed_id to array: $pubmed_id \n";
279 return $self->{version
};
283 my ($twig, $elt)= @_;
284 print STDERR
"Calling version!!\n";
285 if (lc($tree{name
}) eq lc($feature_object->get_name()) ) {
286 $feature_object->set_version($elt->text);
294 print STDERR
"Calling residues!!\n";
295 return $self->{residues
};
299 my ($twig, $elt)= @_;
300 print STDERR
"CAlling residues!! \n";
301 if (lc($tree{name
}) eq lc($feature_object->get_name()) ) {
302 $feature_object->set_residues($elt->text);
310 return $self->{seqlen
}
314 my ($twig, $elt)= @_;
315 print STDERR
"calling seqlen!!\n";
316 if (lc($tree{name
}) eq lc($feature_object->get_name()) ) {
317 $feature_object->set_seqlen($elt->text() ) ;
324 my ($twig, $elt)= @_;
325 print STDERR
"calling description!!\n";
327 if (lc($tree{name
}) eq lc($feature_object->get_name()) ) {
328 $feature_object->set_description($elt->text);
334 sub get_description
{
336 return $self->{description
};
339 sub get_molecule_type
{
341 print STDERR
"calling molecule_type!!\n";
343 return $self->{molecule_type
}
347 my ($twig, $elt)= @_;
356 'genomic DNA' => 'genomic_clone',
357 'genomic clone' => 'genomic_clone',
358 '1' => 'genomic_clone',
359 'genomic RNA' => 'RNA',
360 'Pre-RNA' => 'PRE-RNA',
361 'unassigned DNA' => 'DNA',
362 'unassigned RNA' => 'RNA',
369 my $mol_text= ($elt->text);
370 my $molecule = $mol_hash{$mol_text};
371 if (!$molecule) {warn "no molecule type found for type $mol_text!!\n";}
372 else { print STDERR
"molecule type = $molecule\n"; }
374 if (lc($tree{name
}) eq lc($feature_object->get_name()) ) {
375 $feature_object->set_molecule_type($molecule);