a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / CXGN / Tools / Gene.pm
blobdd9c7426d337718ae038402fcaf6c54e2f8e1669
1 package CXGN::Tools::Gene;
2 use strict;
3 no strict 'refs';
4 use base qw | CXGN::DB::Object |;
5 use CXGN::DB::Connection;
6 use CXGN::Tools::Identifiers qw/identifier_namespace/;
8 =head1 NAME
10 CXGN::Tools::Gene
12 =head1 SYNOPSIS
14 Fetch properties of a given unigene or arabidopsis gene,
15 such as annotation, sequence, or signalp data. If you
16 want it to do more, please add functions! I wanted
17 a unified object that could grab information about any
18 gene in our database, so this is just a start.
20 WARNING: This module throws an error if the identifier
21 is not recognized. In web scripts, construction of a
22 Gene should be wrapped in eval{}
24 =head1 USAGE
26 my $gene = CXGN::Tools::Gene->new("AT1G01010.1");
27 $gene->fetch;
28 if($gene->isSignalPositive){
29 print $gene->{id} . " is likely to be secreted";
32 #(mostly) works based on database column names from fetch:
33 my $nn_score = $gene->getProperty("nn_score");
35 #custom getters:
36 my $cds = $gene->getSequence("cds");
37 my $annotation = $gene->getAnnotation();
38 #write more!
40 =cut
42 #If a calling script/package has DBH defined as a package variable, attach it to this class
43 our $EXCHANGE_DBH = 1;
44 #All the database queries are in this INIT block
48 #.................................
49 #..... Instance Functions ........
51 =head3 new()
53 Instantiate a new Gene object. As of now, the gene must
54 exist as an arabidopsis or unigene identifier in the database.
55 Argument: Identifier
56 Returns: Gene object
58 Support will be added for "empty" objects if sequence processing
59 is added. Also, I think store() methods aren't a good idea.
61 =cut
63 sub new {
64 my $class = shift;
65 my $identifier = uc ( shift );
66 my $dbh = shift;
68 my $self = $class->SUPER::new($dbh);
69 $self->set_sql();
70 my $namespace = identifier_namespace($identifier);
71 if ($namespace =~ /sgn_u/i) {
72 $self->{type} = "uni";
73 $identifier =~ s/SGN-U//i;
74 $self->{id} = $identifier;
75 } elsif ($namespace =~ /tair_gene_model/i) {
76 $self->{type} = "ara";
77 $self->{id} = $identifier;
78 } elsif ($namespace =~ /tair_locus/i) {
79 print STDERR "Tair Locus passed ($identifier), using first gene model $identifier.1\n";
80 $self->{type} = "ara";
81 $self->{id} = $identifier . '.1';
82 } elsif ($namespace =~ /genbank_accession/i) {
83 $self->_resolve_ncbi($identifier);
84 die "NCBI identifier [$identifier] does not map to TAIR gene model" unless $self->{id};
85 } else {
86 die "You must send a SGN-Unigene or TAIR Gene model (AT1G01010.1) identifier as the first argument to this class [$identifier]";
88 print STDERR "[", $self->_check_exist_no_die(), "]\n";
89 return undef unless $self->_check_exist_no_die();
91 # $self->_check_existence;
93 return $self;
96 sub _check_exist_no_die {
97 my $self = shift;
98 #print STDERR "in _check_exist_no_die. Before get_sql \n";
99 my $sth = $self->get_sql($self->{type} . "_check");
100 #print STDERR "in _check_exist_no_die. Before execute \n";
101 $sth->execute($self->{id});
102 #print STDERR "in _check_exist_no_die. after execute \n";
103 my (@row) = $sth->fetchrow_array();
104 unless(@row){
105 print STDERR "Gene with type '$self->{type}' and id '$self->{id}' not found in database\n";
106 return 0;
108 return 1;
111 sub _check_existence {
112 my $self = shift;
113 my $sth = $self->get_sql($self->{type} . "_check");
114 $sth->execute($self->{id});
115 my (@row) = $sth->fetchrow_array();
116 unless(@row){
117 die
118 "Gene with type '$self->{type}' and id '$self->{id}' not found in database\n";
122 sub _resolve_ncbi {
123 my $self = shift;
124 my $id = shift;
125 unless($id){
126 $id = $self->{id};
128 return unless $id;
129 return unless identifier_namespace($id) =~ /genbank_accession/i;
132 my $sth = $self->get_sql("ara_ncbi_check");
133 $sth->execute($id, $id);
135 my $row = $sth->fetchrow_hashref;
136 if($row->{agi}){
137 $self->{type} = "ara";
138 $self->{id} = $row->{agi};
139 return $self->{id};
141 return;
145 =head3 fetch()
147 Argument: Query hash key [ See _set_query_hashes() ]
148 Returns: Nothing
149 Side Effect: Populate object with selected information from database
151 =cut
154 sub fetch {
155 my $self = shift;
156 my $query_key = lc ( shift );
157 my $option = shift;
158 if(!$query_key) { $query_key = "annot" }
159 my %query_hash;
161 my $sth = $self->get_sql($self->{type}."_".$query_key);
163 $sth->execute($self->{id});
165 #extra junk for getting unigene cdna sequence
166 if($query_key eq "seq" && $self->{type} eq "uni"){
167 while(my $row = $sth->fetchrow_hashref){
168 unless ($self->property('protein_seq') && $row->{preferred} ne 't'){
169 $self->property("seq_edits", $row->{seq_edits});
170 $self->property("protein_seq", $row->{protein_seq});
173 $self->fetch("general");
174 my $qt;
175 ($self->getProperty("nr_members")>1)?($qt="consensi"):($qt="singleton");
176 my $sth = $self->get_sql("uni_".$qt);
178 $sth->execute($self->{id});
179 $self->property("cdna", $sth->fetchrow_array());
180 return; #have all we need now
183 my $row = $sth->fetchrow_hashref;
184 while( my ($key, $value) = each %$row ) {
185 $self->property($key, $value);
191 =head3 fetch_all
193 Fetch all information from database pertaining to gene.
194 Convenient, but inefficient for mass process.
196 =cut
198 sub fetch_all {
199 my $self = shift;
200 $self->fetch();
201 $self->fetch_sigp();
202 $self->fetch_seq();
203 $self->fetch_dom();
204 if($self->{type} eq "uni"){
205 my @uni_specific = qw/ blast_annot general consensi singleton /;
206 $self->fetch($_) foreach @uni_specific;
210 sub fetch_sigp {
211 my $self = shift;
212 $self->fetch("sigp");
215 sub fetch_seq {
216 my $self = shift;
217 $self->fetch("seq");
220 sub fetch_dom {
221 #Several rows, non-standard fetch:
222 my $self = shift;
223 # return unless $self->{type} eq "ara"; works for unigene now?
224 my $sth = $self->get_sql($self->{type}."_domain");
225 $sth->execute($self->{id});
226 $self->{domains} = [];
227 if($self->{type} eq "ara"){
228 while(my $row = $sth->fetchrow_hashref){
229 push(@{$self->{domains}}, $row);
232 else {
233 #Only get rows for one of the cds_id's, in case multiple cds_id's
234 #exist for a particular unigene_id:
235 my $row = $sth->fetchrow_hashref;
236 return unless $row;
237 my $cds_id = $row->{dom_cds_id};
238 #fix interpro id's
240 my $ipr = $row->{dom_interpro_id};
241 if($ipr =~ /^\d+$/){
242 $ipr = "IPR" . sprintf("%06s", $ipr);
243 $row->{dom_interpro_id} = $ipr;
245 push(@{$self->{domains}}, $row);
246 while(my $row = $sth->fetchrow_hashref){
247 push(@{$self->{domains}}, $row) if ($row->{dom_cds_id} eq $cds_id);
252 sub fetch_go {
253 my $self = shift;
254 return unless $self->{type} eq "ara"; #Arabidopsis only, for now
255 my $sth = $self->get_sql("ara_go");
256 $sth->execute($self->{id});
258 my $gref = {};
259 $gref->{$_} = [] foreach(qw/comp func proc/);
261 while(my $row = $sth->fetchrow_hashref) {
262 my $prop = {};
263 $prop->{$_} = $row->{$_} foreach(qw/go_id evidence content pmid/);
264 push @{$gref->{$row->{type}}}, $prop;
266 $self->{go} = $gref;
270 sub getGO { get_go(@_) }
272 sub get_go {
273 my $self = shift;
274 $self->fetch_go unless($self->{go});
275 return $self->{go};
278 sub property {
279 my $self = shift;
280 my ($k,$v) = @_;
281 return undef unless $k;
282 $self->{db_keys}->{$k} = $v if defined $v;
283 return $self->{db_keys}->{$k};
286 =head3 get_property($key)
288 Return a property of the gene, based on the column name in the database
289 under which the information can be found. There are some exceptions
290 to this rule, such as with the cdna sequence in Unigenes, and for aliased
291 query returns. See the _prepare_global_queries() section for key aliases.
293 Arguments: key name
294 Returns: value
296 =cut
298 sub getProperty {
299 my $self = shift;
300 my $key = shift;
301 return $self->{db_keys}->{$key};
305 =head3 getProperties
307 Just like getProperty, except it takes and returns an array
309 =cut
311 sub getProperties {
312 my $self = shift;
313 return map { $self->property($_) } @_;
316 sub getSpecies {
317 my $self = shift;
318 unless($self->_check_key("species")){
319 $self->property("species", "Arabidopsis T.") if $self->{type} eq "ara";
320 $self->fetch("general") if $self->{type} eq "uni";
322 print STDERR "in Gene::getSpecies, species: ", $self->property("species"), "\n";
323 return $self->property("species");
326 =head3 getSequence
328 Arguments: 'genomic', 'cdna', 'cds', or 'protein'
329 Returns: A sequence
330 Issues: No 'genomic' support for Unigenes
332 =cut
334 sub getSequence {
335 my $self = shift;
336 my $seq_type = shift;
337 die "Only supported seq_types for sequences: protein, cds, cdna, genomic"
338 unless $seq_type =~ /(protein)|(cds)|(cdna)|(genomic)/;
340 $self->fetch("seq") unless $self->_check_key($seq_type);
341 if($self->{type} eq "uni"){
342 return $self->property("seq_edits") if $seq_type eq "cds";
343 return $self->property("protein_seq") if $seq_type eq "protein";
344 return $self->property("cdna") if $seq_type eq "cdna";
346 #Ok, so you can't get a REAL genomic sequence
347 warn "No 'genomic' sequence supported for Unigenes yet" if $seq_type eq "genomic";
348 return undef;
350 if($self->{type} eq "ara"){
351 return $self->property($seq_type);
355 sub getAnnotation {
356 my $self = shift;
357 if ($self->{type} eq "ara") {
358 $self->fetch("annot") unless $self->_check_key("tair_annotation");
359 return $self->property("tair_annotation");
361 if ($self->{type} eq "uni") {
362 $self->fetch("blast_annot") unless $self->_check_key("blast_annot_content");
363 $self->fetch("annot") unless $self->_check_key("annot_content");
364 my $manual = $self->property("annot_content");
365 my $blast = $self->property("blast_annot_content");
366 return $manual if $manual;
367 if($blast){
368 #Trim off the fluff. If you need it, use fetch("blast_annot")
369 # and getProperty("blast_annot_content");
370 my ($agi, $ncbi_id);
371 if($blast =~ /^\s*AT[1-5MC]G/i){
372 ($agi) = $blast =~ /(AT[1-5MC]G\d{5}\.\d+)/i;
374 $blast =~ s/.*?\|.*?\|\s*//;
375 $blast =~ s/\s*\|.*//;
376 $blast = "[BLAST: $agi] $blast" if $agi;
378 elsif($blast =~ /^\s*gi\s*\|/i){
379 ($ncbi_id) = $blast =~ /\s*gi\s*\|\s*\d+\s*\|\s*\w+\s*\|\s*(\S+)\s*\|/i;
380 $blast =~ s/.*?\|.*?\|.*?\|.*?\|//;
381 $blast = "[BLAST: $ncbi_id] $blast" if $ncbi_id;
383 $blast = "[BLAST] $blast" unless($ncbi_id || $agi);
385 return $blast if $blast;
387 print STDERR "No annotation available for " . $self->{id};
388 return;
392 =head3 getDomains
394 Returns an array of hash references, one for each domain:
395 dom_desc => "NAME111"
396 dom_full_desc => "Actual description"
397 dom_start => start aa position
398 dom_end => end aa position
399 dom_interpro_id => intropro accession # for domain
401 =cut
403 sub getDomains {
404 my $self = shift;
405 $self->fetch_dom unless (ref $self->{domains} && @{$self->{domains}});
406 return @{$self->{domains}};
409 =head3 SignalP Getters
411 getSignalScore() - neural network score
412 getCleavagePosition() - AA # to the right of cleavage point
413 isSignalPositive() - 1 if predicted secretion, 0 if not
414 getSignalPeptide() - returns signal peptide sequence if positive, null string otherwise
415 getCleavedSequence() - returns protein sequence without signal peptide if signal positive,
416 otherwise returns a null string
418 =cut
420 sub getSignalScore {
421 my $self = shift;
422 $self->fetch_sigp unless $self->_check_key("nn_score");
423 return $self->property("nn_score");
426 sub getSignalPeptide {
427 my $self = shift;
428 $self->fetch_sigp unless $self->_check_key("nn_ypos");
429 my $protseq = $self->get_sequence("protein");
430 if($self->is_signal_positive){
431 my $cp = $self->get_cleavage_position();
432 return substr($protseq, 0, $cp-1);
434 else {
435 return '';
439 sub getCleavedSequence {
440 my $self = shift;
441 $self->fetch_sigp unless $self->_check_key("nn_ypos");
442 my $protseq = $self->get_sequence("protein");
443 if($self->is_signal_positive){
444 my $cp = $self->get_cleavage_position();
445 return substr($protseq, $cp-1);
447 else {
448 return '';
452 sub getCleavagePosition {
453 my $self = shift;
454 $self->fetch_sigp unless $self->_check_key("nn_ypos");
455 return $self->property("nn_ypos");
458 sub isSignalPositive {
459 my $self = shift;
460 $self->fetch_sigp unless $self->_check_key("nn_d");
461 my $dec = $self->property("nn_d");
462 if($dec eq 'Y'){
463 return 1;
465 elsif($dec eq 'N'){
466 return 0;
468 else {
469 #There is a serious problem with signalp formats that needs to be fixed immediately:
470 warn "\nSignalProblem: NN decision score ($dec) not recognized for " . $self->{id} . ", returning false\n";
471 return 0;
475 #Temporary sub aliases, get rid of that camel case! What was I thinking?
476 *get_property = \&getProperty;
477 *get_properties = \&getProperties;
478 *get_species = \&getSpecies;
479 *get_sequence = \&getSequence;
480 *get_annotation = \&getAnnotation;
481 *get_domains = \&getDomains;
482 *get_signal_score = \&getSignalScore;
483 *get_signal_peptide = \&getSignalPeptide;
484 *get_cleaved_sequence = \&getCleavedSequence;
485 *get_cleavage_position = \&getCleavagePosition;
486 *is_signal_positive = \&isSignalPositive;
488 sub _check_key {
489 my $self = shift;
490 my $key = shift;
491 unless(exists $self->{db_keys}->{$key}){
492 return 0;
494 return 1;
497 sub set_sql {
498 my $self = shift;
500 # __PACKAGE__->required_search_paths(qw/public sgn/);
502 $self->{queries} = {
504 #Arabidopsis Queries
506 ara_check =>
508 "SELECT agi FROM public.ara_properties WHERE agi=?",
510 ara_annot =>
512 "SELECT * FROM ara_properties JOIN ara_annotation USING(agi) WHERE agi=?",
514 ara_sigp =>
516 "SELECT * FROM ara_signalp WHERE agi=?",
518 ara_seq =>
520 "SELECT * FROM ara_sequence WHERE agi=?",
522 ara_domain =>
524 "SELECT dom_desc, dom_start, dom_end,
525 interpro_id AS dom_interpro_id,
526 interpro_dom AS dom_full_desc
527 FROM ara_domain
528 WHERE agi=?",
530 ara_ncbi_check =>
533 SELECT agi
534 FROM ara_annotation
535 WHERE
536 gb_prot_id=?
537 OR gb_mrna_id=?
540 ara_protein_length =>
542 "SELECT LENGTH(protein) AS protein_length FROM ara_sequence WHERE agi=?",
544 ara_go =>
547 SELECT type, go_id, evidence, content, pmid
548 FROM ara_go
549 WHERE
550 gene=?
553 #Unigene Queries
556 uni_domain =>
558 "SELECT cds_id AS dom_cds_id,
559 domain_accession,
560 match_begin AS dom_start,
561 match_end AS dom_end,
562 interpro_id AS dom_interpro_id,
563 description AS dom_desc
564 FROM domain_match
565 LEFT JOIN domain USING (domain_id)
566 WHERE
567 unigene_id=?
570 uni_check =>
572 "SELECT unigene_id FROM unigene WHERE unigene_id=?",
574 uni_seq =>
577 SELECT seq_edits, protein_seq
578 FROM cds
579 WHERE unigene_id=?
582 uni_sigp =>
585 SELECT * FROM signalp
586 WHERE
587 cds_id=
589 SELECT cds_id
590 FROM cds
591 WHERE
592 unigene_id=?
593 AND method='estscan'
597 uniq_manual_annot =>
600 SELECT
601 sgn_people.sp_person.first_name || ' ' || sgn_people.sp_person.last_name AS annot_contributor,
602 manual_annotations.date_entered AS annot_date_entered,
603 manual_annotations.last_modified AS annot_last_modified,
604 manual_annotations.annotation_text AS annot_content,
605 clone.clone_name AS annot_clone_name
606 FROM unigene
607 LEFT JOIN unigene_member USING (unigene_id)
608 LEFT JOIN est USING (est_id)
609 LEFT JOIN seqread USING (read_id)
610 LEFT JOIN clone USING (clone_id)
611 LEFT JOIN manual_annotations ON (clone.clone_id = manual_annotations.annotation_target_id)
612 LEFT JOIN sgn_people.sp_person ON (manual_annotations.author_id = sgn_people.sp_person.sp_person_id)
613 LEFT JOIN annotation_target_type ON (manual_annotations.annotation_target_type_id = annotation_target_type.annotation_target_type_id)
614 WHERE
615 unigene.unigene_id=?
616 AND annotation_target_type.type_name='clone'
619 uni_blast_annot =>
622 SELECT evalue AS blast_annot_evalue, score AS blast_annot_score,
623 identity_percentage AS blast_annot_identity_percentage,
624 defline AS blast_annot_content
625 FROM blast_hits
626 LEFT JOIN blast_defline USING (defline_id)
627 WHERE blast_annotation_id=
629 SELECT blast_annotation_id
630 FROM blast_annotations
631 LEFT JOIN blast_targets USING (blast_target_id)
632 WHERE apply_id=? and apply_type=15 LIMIT 1
634 ORDER BY score DESC
637 uni_general =>
639 "SELECT
640 unigene.cluster_no,
641 unigene.contig_no,
642 unigene.nr_members,
643 unigene.database_name,
644 unigene.sequence_name,
645 unigene_build.build_nr,
646 unigene_build.build_date,
647 unigene_build.status,
648 unigene_build.comment AS build_comment,
649 groups.comment AS species
650 FROM unigene
651 LEFT JOIN unigene_build USING (unigene_build_id)
652 LEFT JOIN groups ON (organism_group_id=group_id)
653 WHERE
654 unigene_id=?
657 uni_consensi =>
659 " SELECT seq
660 FROM unigene
661 LEFT JOIN unigene_consensi USING (consensi_id)
662 WHERE unigene_id=?
666 uni_singleton =>
668 " SELECT COALESCE(substring(seq FROM (hqi_start)::int+1 FOR (hqi_length)::int ),seq)
669 FROM unigene
670 LEFT JOIN unigene_member USING (unigene_id)
671 LEFT JOIN est USING (est_id)
672 LEFT JOIN qc_report USING (est_id)
673 WHERE
674 unigene.unigene_id=?
677 uni_protein_length =>
680 SELECT LENGTH(protein_seq) AS protein_length
681 FROM cds
682 WHERE
683 unigene_id=?
684 AND preferred='t'
691 while(my ($name, $query) = each %{$self->{queries}}){
692 $self->{query_handles}->{$name}=$self->get_dbh()->prepare($query);
696 sub get_sql {
697 my $self = shift;
698 my $name = shift;
699 return $self->{query_handles}->{$name};