1 package CXGN
::Tools
::Gene
;
4 use base qw
| CXGN
::DB
::Object
|;
5 use CXGN
::DB
::Connection
;
6 use CXGN
::Tools
::Identifiers qw
/identifier_namespace/;
14 Fetch properties of a given unigene or arabidopsis gene,
15 such as annotation, sequence, or signalp data. If you
16 want it to do more, please add functions! I wanted
17 a unified object that could grab information about any
18 gene in our database, so this is just a start.
20 WARNING: This module throws an error if the identifier
21 is not recognized. In web scripts, construction of a
22 Gene should be wrapped in eval{}
26 my $gene = CXGN::Tools::Gene->new("AT1G01010.1");
28 if($gene->isSignalPositive){
29 print $gene->{id} . " is likely to be secreted";
32 #(mostly) works based on database column names from fetch:
33 my $nn_score = $gene->getProperty("nn_score");
36 my $cds = $gene->getSequence("cds");
37 my $annotation = $gene->getAnnotation();
42 #If a calling script/package has DBH defined as a package variable, attach it to this class
43 our $EXCHANGE_DBH = 1;
44 #All the database queries are in this INIT block
48 #.................................
49 #..... Instance Functions ........
53 Instantiate a new Gene object. As of now, the gene must
54 exist as an arabidopsis or unigene identifier in the database.
58 Support will be added for "empty" objects if sequence processing
59 is added. Also, I think store() methods aren't a good idea.
65 my $identifier = uc ( shift );
68 my $self = $class->SUPER::new
($dbh);
70 my $namespace = identifier_namespace
($identifier);
71 if ($namespace =~ /sgn_u/i) {
72 $self->{type
} = "uni";
73 $identifier =~ s/SGN-U//i;
74 $self->{id
} = $identifier;
75 } elsif ($namespace =~ /tair_gene_model/i) {
76 $self->{type
} = "ara";
77 $self->{id
} = $identifier;
78 } elsif ($namespace =~ /tair_locus/i) {
79 print STDERR
"Tair Locus passed ($identifier), using first gene model $identifier.1\n";
80 $self->{type
} = "ara";
81 $self->{id
} = $identifier . '.1';
82 } elsif ($namespace =~ /genbank_accession/i) {
83 $self->_resolve_ncbi($identifier);
84 die "NCBI identifier [$identifier] does not map to TAIR gene model" unless $self->{id
};
86 die "You must send a SGN-Unigene or TAIR Gene model (AT1G01010.1) identifier as the first argument to this class [$identifier]";
88 print STDERR
"[", $self->_check_exist_no_die(), "]\n";
89 return undef unless $self->_check_exist_no_die();
91 # $self->_check_existence;
96 sub _check_exist_no_die
{
98 #print STDERR "in _check_exist_no_die. Before get_sql \n";
99 my $sth = $self->get_sql($self->{type
} . "_check");
100 #print STDERR "in _check_exist_no_die. Before execute \n";
101 $sth->execute($self->{id
});
102 #print STDERR "in _check_exist_no_die. after execute \n";
103 my (@row) = $sth->fetchrow_array();
105 print STDERR
"Gene with type '$self->{type}' and id '$self->{id}' not found in database\n";
111 sub _check_existence
{
113 my $sth = $self->get_sql($self->{type
} . "_check");
114 $sth->execute($self->{id
});
115 my (@row) = $sth->fetchrow_array();
118 "Gene with type '$self->{type}' and id '$self->{id}' not found in database\n";
129 return unless identifier_namespace
($id) =~ /genbank_accession/i;
132 my $sth = $self->get_sql("ara_ncbi_check");
133 $sth->execute($id, $id);
135 my $row = $sth->fetchrow_hashref;
137 $self->{type
} = "ara";
138 $self->{id
} = $row->{agi
};
147 Argument: Query hash key [ See _set_query_hashes() ]
149 Side Effect: Populate object with selected information from database
156 my $query_key = lc ( shift );
158 if(!$query_key) { $query_key = "annot" }
161 my $sth = $self->get_sql($self->{type
}."_".$query_key);
163 $sth->execute($self->{id
});
165 #extra junk for getting unigene cdna sequence
166 if($query_key eq "seq" && $self->{type
} eq "uni"){
167 while(my $row = $sth->fetchrow_hashref){
168 unless ($self->property('protein_seq') && $row->{preferred
} ne 't'){
169 $self->property("seq_edits", $row->{seq_edits
});
170 $self->property("protein_seq", $row->{protein_seq
});
173 $self->fetch("general");
175 ($self->getProperty("nr_members")>1)?
($qt="consensi"):($qt="singleton");
176 my $sth = $self->get_sql("uni_".$qt);
178 $sth->execute($self->{id
});
179 $self->property("cdna", $sth->fetchrow_array());
180 return; #have all we need now
183 my $row = $sth->fetchrow_hashref;
184 while( my ($key, $value) = each %$row ) {
185 $self->property($key, $value);
193 Fetch all information from database pertaining to gene.
194 Convenient, but inefficient for mass process.
204 if($self->{type
} eq "uni"){
205 my @uni_specific = qw
/ blast_annot general consensi singleton /;
206 $self->fetch($_) foreach @uni_specific;
212 $self->fetch("sigp");
221 #Several rows, non-standard fetch:
223 # return unless $self->{type} eq "ara"; works for unigene now?
224 my $sth = $self->get_sql($self->{type
}."_domain");
225 $sth->execute($self->{id
});
226 $self->{domains
} = [];
227 if($self->{type
} eq "ara"){
228 while(my $row = $sth->fetchrow_hashref){
229 push(@
{$self->{domains
}}, $row);
233 #Only get rows for one of the cds_id's, in case multiple cds_id's
234 #exist for a particular unigene_id:
235 my $row = $sth->fetchrow_hashref;
237 my $cds_id = $row->{dom_cds_id
};
240 my $ipr = $row->{dom_interpro_id
};
242 $ipr = "IPR" . sprintf("%06s", $ipr);
243 $row->{dom_interpro_id
} = $ipr;
245 push(@
{$self->{domains
}}, $row);
246 while(my $row = $sth->fetchrow_hashref){
247 push(@
{$self->{domains
}}, $row) if ($row->{dom_cds_id
} eq $cds_id);
254 return unless $self->{type
} eq "ara"; #Arabidopsis only, for now
255 my $sth = $self->get_sql("ara_go");
256 $sth->execute($self->{id
});
259 $gref->{$_} = [] foreach(qw
/comp func proc/);
261 while(my $row = $sth->fetchrow_hashref) {
263 $prop->{$_} = $row->{$_} foreach(qw
/go_id evidence content pmid/);
264 push @
{$gref->{$row->{type
}}}, $prop;
270 sub getGO
{ get_go
(@_) }
274 $self->fetch_go unless($self->{go
});
281 return undef unless $k;
282 $self->{db_keys
}->{$k} = $v if defined $v;
283 return $self->{db_keys
}->{$k};
286 =head3 get_property($key)
288 Return a property of the gene, based on the column name in the database
289 under which the information can be found. There are some exceptions
290 to this rule, such as with the cdna sequence in Unigenes, and for aliased
291 query returns. See the _prepare_global_queries() section for key aliases.
301 return $self->{db_keys
}->{$key};
307 Just like getProperty, except it takes and returns an array
313 return map { $self->property($_) } @_;
318 unless($self->_check_key("species")){
319 $self->property("species", "Arabidopsis T.") if $self->{type
} eq "ara";
320 $self->fetch("general") if $self->{type
} eq "uni";
322 print STDERR
"in Gene::getSpecies, species: ", $self->property("species"), "\n";
323 return $self->property("species");
328 Arguments: 'genomic', 'cdna', 'cds', or 'protein'
330 Issues: No 'genomic' support for Unigenes
336 my $seq_type = shift;
337 die "Only supported seq_types for sequences: protein, cds, cdna, genomic"
338 unless $seq_type =~ /(protein)|(cds)|(cdna)|(genomic)/;
340 $self->fetch("seq") unless $self->_check_key($seq_type);
341 if($self->{type
} eq "uni"){
342 return $self->property("seq_edits") if $seq_type eq "cds";
343 return $self->property("protein_seq") if $seq_type eq "protein";
344 return $self->property("cdna") if $seq_type eq "cdna";
346 #Ok, so you can't get a REAL genomic sequence
347 warn "No 'genomic' sequence supported for Unigenes yet" if $seq_type eq "genomic";
350 if($self->{type
} eq "ara"){
351 return $self->property($seq_type);
357 if ($self->{type
} eq "ara") {
358 $self->fetch("annot") unless $self->_check_key("tair_annotation");
359 return $self->property("tair_annotation");
361 if ($self->{type
} eq "uni") {
362 $self->fetch("blast_annot") unless $self->_check_key("blast_annot_content");
363 $self->fetch("annot") unless $self->_check_key("annot_content");
364 my $manual = $self->property("annot_content");
365 my $blast = $self->property("blast_annot_content");
366 return $manual if $manual;
368 #Trim off the fluff. If you need it, use fetch("blast_annot")
369 # and getProperty("blast_annot_content");
371 if($blast =~ /^\s*AT[1-5MC]G/i){
372 ($agi) = $blast =~ /(AT[1-5MC]G\d{5}\.\d+)/i;
374 $blast =~ s/.*?\|.*?\|\s*//;
375 $blast =~ s/\s*\|.*//;
376 $blast = "[BLAST: $agi] $blast" if $agi;
378 elsif($blast =~ /^\s*gi\s*\|/i){
379 ($ncbi_id) = $blast =~ /\s*gi\s*\|\s*\d+\s*\|\s*\w+\s*\|\s*(\S+)\s*\|/i;
380 $blast =~ s/.*?\|.*?\|.*?\|.*?\|//;
381 $blast = "[BLAST: $ncbi_id] $blast" if $ncbi_id;
383 $blast = "[BLAST] $blast" unless($ncbi_id || $agi);
385 return $blast if $blast;
387 print STDERR
"No annotation available for " . $self->{id
};
394 Returns an array of hash references, one for each domain:
395 dom_desc => "NAME111"
396 dom_full_desc => "Actual description"
397 dom_start => start aa position
398 dom_end => end aa position
399 dom_interpro_id => intropro accession # for domain
405 $self->fetch_dom unless (ref $self->{domains
} && @
{$self->{domains
}});
406 return @
{$self->{domains
}};
409 =head3 SignalP Getters
411 getSignalScore() - neural network score
412 getCleavagePosition() - AA # to the right of cleavage point
413 isSignalPositive() - 1 if predicted secretion, 0 if not
414 getSignalPeptide() - returns signal peptide sequence if positive, null string otherwise
415 getCleavedSequence() - returns protein sequence without signal peptide if signal positive,
416 otherwise returns a null string
422 $self->fetch_sigp unless $self->_check_key("nn_score");
423 return $self->property("nn_score");
426 sub getSignalPeptide
{
428 $self->fetch_sigp unless $self->_check_key("nn_ypos");
429 my $protseq = $self->get_sequence("protein");
430 if($self->is_signal_positive){
431 my $cp = $self->get_cleavage_position();
432 return substr($protseq, 0, $cp-1);
439 sub getCleavedSequence
{
441 $self->fetch_sigp unless $self->_check_key("nn_ypos");
442 my $protseq = $self->get_sequence("protein");
443 if($self->is_signal_positive){
444 my $cp = $self->get_cleavage_position();
445 return substr($protseq, $cp-1);
452 sub getCleavagePosition
{
454 $self->fetch_sigp unless $self->_check_key("nn_ypos");
455 return $self->property("nn_ypos");
458 sub isSignalPositive
{
460 $self->fetch_sigp unless $self->_check_key("nn_d");
461 my $dec = $self->property("nn_d");
469 #There is a serious problem with signalp formats that needs to be fixed immediately:
470 warn "\nSignalProblem: NN decision score ($dec) not recognized for " . $self->{id
} . ", returning false\n";
475 #Temporary sub aliases, get rid of that camel case! What was I thinking?
476 *get_property
= \
&getProperty
;
477 *get_properties
= \
&getProperties
;
478 *get_species
= \
&getSpecies
;
479 *get_sequence
= \
&getSequence
;
480 *get_annotation
= \
&getAnnotation
;
481 *get_domains
= \
&getDomains
;
482 *get_signal_score
= \
&getSignalScore
;
483 *get_signal_peptide
= \
&getSignalPeptide
;
484 *get_cleaved_sequence
= \
&getCleavedSequence
;
485 *get_cleavage_position
= \
&getCleavagePosition
;
486 *is_signal_positive
= \
&isSignalPositive
;
491 unless(exists $self->{db_keys
}->{$key}){
500 # __PACKAGE__->required_search_paths(qw/public sgn/);
508 "SELECT agi FROM public.ara_properties WHERE agi=?",
512 "SELECT * FROM ara_properties JOIN ara_annotation USING(agi) WHERE agi=?",
516 "SELECT * FROM ara_signalp WHERE agi=?",
520 "SELECT * FROM ara_sequence WHERE agi=?",
524 "SELECT dom_desc, dom_start, dom_end,
525 interpro_id AS dom_interpro_id,
526 interpro_dom AS dom_full_desc
540 ara_protein_length
=>
542 "SELECT LENGTH(protein) AS protein_length FROM ara_sequence WHERE agi=?",
547 SELECT type, go_id, evidence, content, pmid
558 "SELECT cds_id AS dom_cds_id,
560 match_begin AS dom_start,
561 match_end AS dom_end,
562 interpro_id AS dom_interpro_id,
563 description AS dom_desc
565 LEFT JOIN domain USING (domain_id)
572 "SELECT unigene_id FROM unigene WHERE unigene_id=?",
577 SELECT seq_edits, protein_seq
585 SELECT * FROM signalp
601 sgn_people.sp_person.first_name || ' ' || sgn_people.sp_person.last_name AS annot_contributor,
602 manual_annotations.date_entered AS annot_date_entered,
603 manual_annotations.last_modified AS annot_last_modified,
604 manual_annotations.annotation_text AS annot_content,
605 clone.clone_name AS annot_clone_name
607 LEFT JOIN unigene_member USING (unigene_id)
608 LEFT JOIN est USING (est_id)
609 LEFT JOIN seqread USING (read_id)
610 LEFT JOIN clone USING (clone_id)
611 LEFT JOIN manual_annotations ON (clone.clone_id = manual_annotations.annotation_target_id)
612 LEFT JOIN sgn_people.sp_person ON (manual_annotations.author_id = sgn_people.sp_person.sp_person_id)
613 LEFT JOIN annotation_target_type ON (manual_annotations.annotation_target_type_id = annotation_target_type.annotation_target_type_id)
616 AND annotation_target_type.type_name='clone'
622 SELECT evalue AS blast_annot_evalue, score AS blast_annot_score,
623 identity_percentage AS blast_annot_identity_percentage,
624 defline AS blast_annot_content
626 LEFT JOIN blast_defline USING (defline_id)
627 WHERE blast_annotation_id=
629 SELECT blast_annotation_id
630 FROM blast_annotations
631 LEFT JOIN blast_targets USING (blast_target_id)
632 WHERE apply_id=? and apply_type=15 LIMIT 1
643 unigene.database_name,
644 unigene.sequence_name,
645 unigene_build.build_nr,
646 unigene_build.build_date,
647 unigene_build.status,
648 unigene_build.comment AS build_comment,
649 groups.comment AS species
651 LEFT JOIN unigene_build USING (unigene_build_id)
652 LEFT JOIN groups ON (organism_group_id=group_id)
661 LEFT JOIN unigene_consensi USING (consensi_id)
668 " SELECT COALESCE(substring(seq FROM (hqi_start)::int+1 FOR (hqi_length)::int ),seq)
670 LEFT JOIN unigene_member USING (unigene_id)
671 LEFT JOIN est USING (est_id)
672 LEFT JOIN qc_report USING (est_id)
677 uni_protein_length
=>
680 SELECT LENGTH(protein_seq) AS protein_length
691 while(my ($name, $query) = each %{$self->{queries
}}){
692 $self->{query_handles
}->{$name}=$self->get_dbh()->prepare($query);
699 return $self->{query_handles
}->{$name};