a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / CXGN / Tools / FeatureTree.pm
blob73f18eb1b2fdcf78cb1a09c8a2be1a4c8e7befa0
1 package CXGN::Tools::FeatureTree;
2 use strict;
4 use XML::Twig;
6 use CXGN::Chado::Feature;
8 use CXGN::Tools::Entrez;
=head1 CXGN::Tools::FeatureTree

Fetches a record from the NCBI nuccore (nucleotide) database through the
E-utilities efetch service and parses the fields needed to fill feature objects.

=head1 Authors

Naama Menda (nm249@cornell.edu)

=cut
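=head2 new

 Usage: my $feature_fetch = CXGN::Tools::FeatureTree->new($GBaccession);
 Desc:  Creates a FeatureTree object. When an accession is passed, the
        package-wide feature list is emptied and fetch() is called for
        that accession.
 Ret:   the new object, blessed into the invoking class
 Args:  genbank accession (optional)
 Side Effects: with an accession, resets the feature list and runs fetch()
 Example:

=cut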
# Package-wide store filled by the XML::Twig handler subs in this file while
# fetch() parses a record. Each slot holds an array ref of collected values:
#   0  organism name          1  organism taxon id
#   2  name                   3  accession (GI number)
#   4  PubMed ids             5  version
#   6  residues               7  sequence length
#   8  description            9  molecule type
#our $feature_object=undef;
our @Ftree = ();
sub new {
    my ( $class, $gb ) = @_;

    # The object itself carries no data; parsed values live in @Ftree.
    my $self = bless {}, $class;

    # Without an accession nothing is fetched and @Ftree is left untouched.
    return $self unless $gb;

    # Start from an empty list so values from an earlier fetch do not leak in.
    @Ftree = ();
    $self->fetch($gb);

    return $self;
}
=head2 fetch

 Usage: my $error = $self->fetch($gb_accession);
 Desc:  Downloads the nuccore record for the accession from the NCBI
        efetch service (by running wget) and parses it with XML::Twig.
        The handler subs of this package store the parsed values in @Ftree.
 Ret:   undef on success; on failure the error message, which is also
        printed to STDERR
 Args:  genbank accession
 Side Effects: runs wget, prints to STDERR, appends to @Ftree
 Example:

=cut

sub fetch {
    my $self = shift;
    my $gb   = shift;    #GenBank accessions are stored in feature.name!

    # Percent-encode everything outside the URL-safe set so that the caller's
    # value can never break out of the "id" query parameter.
    my $id = defined $gb ? "$gb" : '';
    utf8::encode($id) if utf8::is_utf8($id);
    $id =~ s/([^A-Za-z0-9_.~,-])/sprintf('%%%02X', ord($1))/ge;

    my $url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        . "?db=nuccore&id=$id&rettype=xml&retmode=text";

    eval {
        # List-form pipe open hands the arguments straight to wget; no shell
        # is involved, so nothing in the accession can be run as a command.
        open( my $wget, '-|', 'wget', $url, '-O', '-' )
            or die "could not run wget: $!\n";
        my $feature_xml = do { local $/; <$wget> };
        close $wget;
        $feature_xml = '' unless defined $feature_xml;

        my $twig = XML::Twig->new(
            twig_roots => {
                'Textseq-id/Textseq-id_accession' => \&name,
                'Textseq-id/Textseq-id_version'   => \&version,
                'Seq-data_iupacna/IUPACna'        => \&residues,
                'Seq-inst/Seq-inst_length'        => \&seqlen,
                'Seqdesc/Seqdesc_title'           => \&description,
                'Org-ref/Org-ref_taxname'         => \&organism_name,
                'Org-ref_db/Dbtag/Dbtag_tag/Object-id/Object-id_id' => \&organism_taxon_id,
                'PubMedId'                        => \&pubmed_id,
                'Bioseq_id/Seq-id/Seq-id_gi'      => \&accession,    # accession refers to genBnk GI number
                'MolInfo/MolInfo_biomol'          => \&molecule_type,
            },
            twig_handlers => {
                # AbstractText => \&abstract,
            },
            pretty_print => 'indented',    # output will be nicely formatted
        );

        # An empty or malformed download makes parse() die, which lands in
        # the error branch below.
        $twig->parse($feature_xml);
    };
    if ($@) {
        my $message = "!!NCBI server seems to be down. Please try again later!!.\n $@";
        print STDERR $message;
        return $message;
    }

    print STDERR "exiting FeatureTree.pm!\n";

    # Explicit undef (not a bare return) so that callers in list context keep
    # receiving a single undef element, as they always have.
    return undef;
}
=head2 get_feature_list

 Usage: my @features = $feature_fetch->get_feature_list();
 Desc:  Gives access to the package-wide list filled during fetch().
 Ret:   the contents of @Ftree (its element count in scalar context)
 Args:  none
 Side Effects: none
 Example:

=cut

sub get_feature_list {
    my ($self) = @_;
    return @Ftree;
}
=head2 organism_name

 Usage: registered in fetch() as the handler for 'Org-ref/Org-ref_taxname'
 Desc:  Store the scientific organism name. This is used only for cleaner error messages.
 Ret:   the value of $twig->purge()
 Args:  the twig, the matched element
 Side Effects: appends the element text to $Ftree[0] and purges the twig
 Example:

=cut

sub organism_name {
    my $twig    = shift;
    my $element = shift;

    my $taxname = $element->text;
    push @{ $Ftree[0] }, $taxname;

    # Must stay the last statement: its value is what the handler returns.
    $twig->purge();
}
=head2 organism_taxon_id

 Usage: registered in fetch() as the handler for
        'Org-ref_db/Dbtag/Dbtag_tag/Object-id/Object-id_id'
 Desc:  Store the genbank taxon_id
 Ret:   the value of $twig->purge()
 Args:  the twig, the matched element
 Side Effects: appends the element text to $Ftree[1] and purges the twig
 Example:

=cut

sub organism_taxon_id {
    my $twig    = shift;
    my $element = shift;

    my $taxon_id = $element->text;
    push @{ $Ftree[1] }, $taxon_id;

    # Must stay the last statement: its value is what the handler returns.
    $twig->purge();
}
=head2 name

 Usage: registered in fetch() as the handler for
        'Textseq-id/Textseq-id_accession'
 Desc:  Store the element text (the genbank accession) as the feature name
 Ret:   the value of $twig->purge
 Args:  the twig, the matched element
 Side Effects: appends the element text to $Ftree[2] and purges the twig
 Example:

=cut

sub name {
    my $twig    = shift;
    my $element = shift;

    my $gb_accession = $element->text;
    push @{ $Ftree[2] }, $gb_accession;

    # Must stay the last statement: its value is what the handler returns.
    $twig->purge;
}
=head2 accession

 Usage: registered in fetch() as the handler for 'Bioseq_id/Seq-id/Seq-id_gi'
 Desc:  Store the GI number, skipping values that are already stored
 Ret:   the value of $twig->purge
 Args:  the twig, the matched element
 Side Effects: may append to $Ftree[3]; purges the twig
 Example:

=cut

sub accession {
    my ( $twig, $elt ) = @_;

    my $gi     = $elt->text;
    my $stored = $Ftree[3] ||= [];

    # Plain string equality: the value is data, not a pattern, so it must
    # not be interpolated into a regex to test for duplicates.
    push @{$stored}, $gi unless grep { $_ eq $gi } @{$stored};

    # Must stay the last statement: its value is what the handler returns.
    $twig->purge;
}
=head2 pubmed_id

 Usage: registered in fetch() as the handler for 'PubMedId'
 Desc:  Store the PubMed id, skipping values that are already stored
 Ret:   the value of $twig->purge
 Args:  the twig, the matched element
 Side Effects: may append to $Ftree[4]; purges the twig
 Example:

=cut

sub pubmed_id {
    my ( $twig, $elt ) = @_;

    my $pubmed_id = $elt->text;
    my $stored    = $Ftree[4] ||= [];

    # my @pubmed_ids = $feature_object->get_pubmed_ids();
    # my @already_exists = grep(/$pubmed_id/, @pubmed_ids);
    #if(!@already_exists){
    # $feature_object->add_pubmed_id($pubmed_id);

    # Plain string equality: the value is data, not a pattern, so it must
    # not be interpolated into a regex to test for duplicates.
    push @{$stored}, $pubmed_id unless grep { $_ eq $pubmed_id } @{$stored};

    # Must stay the last statement: its value is what the handler returns.
    $twig->purge;
}
=head2 version

 Usage: registered in fetch() as the handler for
        'Textseq-id/Textseq-id_version'
 Desc:  Store the version of the record
 Ret:   the value of $twig->purge
 Args:  the twig, the matched element
 Side Effects: appends the element text to $Ftree[5] and purges the twig
 Example:

=cut

sub version {
    my $twig    = shift;
    my $element = shift;

    my $record_version = $element->text;
    push @{ $Ftree[5] }, $record_version;

    # Must stay the last statement: its value is what the handler returns.
    $twig->purge;
}
=head2 residues

 Usage: registered in fetch() as the handler for 'Seq-data_iupacna/IUPACna'
 Desc:  Store the sequence residues
 Ret:   the value of $twig->purge
 Args:  the twig, the matched element
 Side Effects: appends the element text to $Ftree[6] and purges the twig
 Example:

=cut

sub residues {
    my $twig    = shift;
    my $element = shift;

    my $sequence = $element->text;
    push @{ $Ftree[6] }, $sequence;

    # Must stay the last statement: its value is what the handler returns.
    $twig->purge;
}
=head2 seqlen

 Usage: registered in fetch() as the handler for 'Seq-inst/Seq-inst_length'
 Desc:  Store the sequence length
 Ret:   the value of $twig->purge
 Args:  the twig, the matched element
 Side Effects: appends the element text to $Ftree[7] and purges the twig
 Example:

=cut

sub seqlen {
    my $twig    = shift;
    my $element = shift;

    my $length = $element->text;
    push @{ $Ftree[7] }, $length;

    # Must stay the last statement: its value is what the handler returns.
    $twig->purge;
}
=head2 description

 Usage: registered in fetch() as the handler for 'Seqdesc/Seqdesc_title'
 Desc:  Store the title of the record as its description
 Ret:   the value of $twig->purge
 Args:  the twig, the matched element
 Side Effects: appends the element text to $Ftree[8] and purges the twig
 Example:

=cut

sub description {
    my $twig    = shift;
    my $element = shift;

    my $title = $element->text;
    push @{ $Ftree[8] }, $title;

    # Must stay the last statement: its value is what the handler returns.
    $twig->purge;
}
=head2 molecule_type

 Usage: registered in fetch() as the handler for 'MolInfo/MolInfo_biomol'
 Desc:  Translate the element text into a molecule type name and store it.
        Text without a translation is reported with a warning and
        nothing is stored.
 Ret:   the value of $twig->purge
 Args:  the twig, the matched element
 Side Effects: may append to $Ftree[9]; prints to STDERR; purges the twig
 Example:

=cut

sub molecule_type {
    my $twig    = shift;
    my $element = shift;

    # Element text => stored molecule type. The numeric keys are presumably
    # the numeric form of the same biomol values -- TODO confirm.
    my %type_for = (
        '1'              => 'genomic_clone',
        '3'              => 'mRNA',
        '4'              => 'rRNA',
        '7'              => 'scRNA',
        '8'              => 'protein',
        'DNA'            => 'DNA',
        'RNA'            => 'RNA',
        'mRNA'           => 'mRNA',
        'rRNA'           => 'rRNA',
        'scRNA'          => 'scRNA',
        'snRNA'          => 'snRNA',
        'Pre-RNA'        => 'PRE-RNA',
        'ss-RNA'         => 'RNA',
        'genomic DNA'    => 'genomic_clone',
        'genomic clone'  => 'genomic_clone',
        'genomic RNA'    => 'RNA',
        'unassigned DNA' => 'DNA',
        'unassigned RNA' => 'RNA',
    );

    my $mol_text = $element->text;
    my $molecule = $type_for{$mol_text};

    if ($molecule) {
        print STDERR "molecule type = $molecule\n\n";
        push @{ $Ftree[9] }, $molecule;
    }
    else {
        warn "no molecule type found for type $mol_text!!\n";
    }

    # Must stay the last statement: its value is what the handler returns.
    $twig->purge;
}
#### DO NOT REMOVE
# Loading this file with `use`/`require` fails unless it yields a true value.
return 1;
####