Merge pull request #42 from solgenomics/topic/duplicate_image_warning
[cxgn-corelibs.git] / lib / CXGN / Transcript / CDS.pm
blobc495fba3d9950e2979e7a4832d2cf2009567d18c
2 =head1 NAME
4 CXGN::Transcript::CDS - a class that deals with coding sequence associated with unigenes.
6 =head1 DESCRIPTION
8 Stores predicted CDS and protein sequences from unigene data. The two methods that are currently used are ESTScan and a simple longest 6 frame translation.
10 The ESTScan CDS predictions contain edits. Insertions are reflected by inserted X's into the sequence, while removed nucleotides are represented by lower case letters in an uppercase background. This sequence is stored in the seq_text field. The actual cds sequence with the lower case letters removed is stored in the seq_edits field.
12 In general, the correct cds sequence is stored into cds.cds_seq column, and is available through the get_cds_seq accessor.
13 The corresponding protein sequence is available through get_protein_seq. Note that ESTScan has a bug in the way it calculates the protein (ignores N's in the nucleotide sequence); therefore, protein sequences need to be calculated using the cds_translate.pl script in /sgn-tools/unigene.
15 The longest 6-frame translations can be generated using the get_longest_protein.pl script in /sgn-tools/protein. It returns both the cds and the protein sequence in two separate fasta files.
18 =head1 AUTHOR
20 Lukas Mueller <lam87@cornell.edu>
22 =head1 EDITS
24 reformat_unigene_sequence_with_edit() added 06/23/08 by Mallory Freeberg
26 =head1 METHODS
28 This class defines the following methods:
30 =cut
32 use strict;
34 package CXGN::Transcript::CDS;
36 use base qw | CXGN::DB::Object |;
37 use base qw | Bio::Seq |;
38 use base qw | CXGN::Transcript::Unigene |;
40 =head2 new
42 Usage:
43 Desc:
44 Ret:
45 Args:
46 Side Effects:
47 Example:
49 =cut
# Constructor.
#   Args: $dbh - database handle, passed on to the parent constructor
#         $id  - optional cds_id; when true, the matching row is loaded
#   Ret:  the new object. Without an id the object is returned empty.
sub new {
    my ($class, $dbh, $id) = @_;

    my $self = $class->SUPER::new($dbh);

    # Nothing to load when no (true) id was supplied.
    return $self unless $id;

    $self->set_cds_id($id);
    $self->fetch();
    return $self;
}
63 =head2 new_with_unigene_id
65 Usage:
66 Desc:
67 Ret:
68 Args:
69 Side Effects:
70 Example:
72 =cut
# Alternate constructor: builds a CDS object from a unigene id.
#   Args: $dbh        - database handle
#         $unigene_id - value matched against sgn.cds.unigene_id
#   Ret:  an object built by new() with the first cds_id found for the
#         unigene. When no row matches, new() receives an undefined id
#         and returns an empty object.
# The query has no ORDER BY, so if a unigene has several cds rows the one
# picked is whichever the database returns first.
sub new_with_unigene_id {
    my ($class, $dbh, $unigene_id) = @_;

    # Schema-qualified like every other query in this class, so the lookup
    # does not depend on the connection's search_path.
    my $sth = $dbh->prepare("SELECT cds_id FROM sgn.cds WHERE unigene_id = ?");
    $sth->execute($unigene_id);
    my ($cds_id) = $sth->fetchrow_array();

    # Only the first row is consumed; release the handle explicitly.
    $sth->finish();

    # Construct through the invoking class so subclasses get objects of
    # their own type (also tolerates being invoked on an instance).
    my $package = ref($class) || $class;
    return $package->new($dbh, $cds_id);
}
88 =head2 exists
90 Usage:
91 Desc:
92 Ret:
93 Args:
94 Side Effects:
95 Example:
97 =cut
# Looks up the cds entries recorded for a unigene and a given type.
# The first argument is taken as the database handle, i.e. no class or
# object is shifted off the argument list.
#   Args: $dbh        - database handle
#         $unigene_id - value matched against sgn.cds.unigene_id
#         $type       - value matched against the "type" column
#   Ret:  list of matching cds_ids (empty list when there are none)
# NOTE(review): fetch() and store() read and write a "method" column and
# never mention "type" -- confirm that sgn.cds really has a "type" column.
sub exists {
    my ($dbh, $unigene_id, $type) = @_;

    my $sth = $dbh->prepare(
        "SELECT cds_id FROM sgn.cds WHERE unigene_id=? and type=?"
    );
    $sth->execute($unigene_id, $type);

    my @found;
    while (my @row = $sth->fetchrow_array()) {
        push @found, $row[0];
    }
    return @found;
}
119 =head2 fetch
121 Usage:
122 Desc:
123 Ret:
124 Args:
125 Side Effects:
126 Example:
128 =cut
# Loads the sgn.cds row whose cds_id equals the object's current cds_id
# and copies every selected column into the object through its setter.
#   Ret: the cds_id read from the database (undef when no row matched;
#        in that case every property, including cds_id, is set to undef).
sub fetch {
    my ($self) = @_;

    my $sql = "SELECT cds_id, unigene_id, protein_feature_id, seq_text, seq_edits, cds_seq, protein_seq, cds.\"begin\", cds.\"end\", cds.forward_reverse, frame, cds.run_id, cds.score, method, preferred FROM sgn.cds WHERE cds_id=?";
    my $sth = $self->get_dbh()->prepare($sql);
    $sth->execute($self->get_cds_id());

    # Column names in SELECT order, used as keys for the fetched row.
    my @columns = qw(
        cds_id unigene_id protein_feature_id seq_text seq_edits cds_seq
        protein_seq begin end forward_reverse frame run_id score method
        preferred
    );
    my %row;
    @row{@columns} = $sth->fetchrow_array();

    $self->set_cds_id($row{cds_id});
    $self->set_unigene_id($row{unigene_id});
    $self->set_protein_feature_id($row{protein_feature_id});
    $self->set_cds_seq($row{cds_seq});
    $self->set_seq_text($row{seq_text});
    $self->set_seq_edits($row{seq_edits});
    $self->set_protein_seq($row{protein_seq});
    $self->set_begin($row{begin});
    $self->set_end($row{end});
    # The forward_reverse column is exposed as the "direction" property.
    $self->set_direction($row{forward_reverse});
    $self->set_frame($row{frame});
    $self->set_run_id($row{run_id});
    $self->set_score($row{score});
    $self->set_method($row{method});
    $self->set_preferred($row{preferred});

    return $row{cds_id};
}
# Writes the object to sgn.cds.
#   - With a true cds_id, the existing row is UPDATEd.
#   - Otherwise a new row is INSERTed and the id generated by the
#     sgn.cds_cds_id_seq sequence is recorded on the object, so that a
#     later store() updates that row instead of inserting a duplicate.
#   Ret: the cds_id of the stored row.
# NOTE(review): the value returned after an UPDATE could not be determined
# from the available source; returning the cds_id keeps it true and makes
# both branches consistent -- confirm no caller relied on another value.
sub store {
    my ($self) = @_;

    # Bind values shared by both statements, in column order.
    my @values = (
        $self->get_unigene_id(),
        $self->get_protein_feature_id(),
        $self->get_seq_text(),
        $self->get_seq_edits(),
        $self->get_cds_seq(),
        $self->get_protein_seq(),
        $self->get_begin(),
        $self->get_end(),
        $self->get_direction(),    # stored in the forward_reverse column
        $self->get_frame(),
        $self->get_run_id(),
        $self->get_score(),
        $self->get_method(),
        $self->get_preferred(),
    );

    if ($self->get_cds_id()) {
        my $query = "UPDATE sgn.cds set
                       unigene_id=?, protein_feature_id=?, seq_text=?, seq_edits=?, cds_seq=?, protein_seq=?, \"begin\"=?, \"end\"=?, forward_reverse=?, frame=?, run_id=?, score=?, method=?, preferred=?
                     WHERE cds_id=?";
        my $sth = $self->get_dbh()->prepare($query);
        $sth->execute(@values, $self->get_cds_id());
    }
    else {
        my $query = "INSERT INTO sgn.cds (
                       unigene_id, protein_feature_id, seq_text, seq_edits, cds_seq, protein_seq, \"begin\", \"end\", forward_reverse, frame, run_id, score, method, preferred) VALUES (?, ?,?, ?, ?, ?, ?, ?, ?, ?,?, ?,?,? )";
        my $sth = $self->get_dbh()->prepare($query);
        $sth->execute(@values);

        # Remember the generated primary key on the object.
        $self->set_cds_id($self->get_currval("sgn.cds_cds_id_seq"));
    }

    return $self->get_cds_id();
}
210 =head2 accessors get_cds_id, set_cds_id
212 Usage:
213 Property: the primary key of the table.
215 =cut
# Returns the cds_id property (the primary key of the cds row).
sub get_cds_id {
    my ($self) = @_;
    return $self->{cds_id};
}

# Stores the given value as the cds_id property.
sub set_cds_id {
    my ($self, $cds_id) = @_;
    $self->{cds_id} = $cds_id;
}
228 =head2 accessors get_unigene_id, set_unigene_id
230 Usage:
231 Property: foreign key to unigene. The unigene this
232 cds entry is associated with.
233 Ret:
234 Args:
235 Side Effects:
236 Example:
238 =cut
# Returns the unigene_id property (the unigene this cds belongs to).
sub get_unigene_id {
    my ($self) = @_;
    return $self->{unigene_id};
}

# Stores the given value as the unigene_id property.
sub set_unigene_id {
    my ($self, $unigene_id) = @_;
    $self->{unigene_id} = $unigene_id;
}
251 =head2 accessors get_protein_feature_id, set_protein_feature_id
253 Usage:
254 Desc:
255 Property
256 Side Effects:
257 Example:
259 =cut
# Returns the protein_feature_id property.
sub get_protein_feature_id {
    my ($self) = @_;
    return $self->{protein_feature_id};
}

# Stores the given value as the protein_feature_id property.
sub set_protein_feature_id {
    my ($self, $protein_feature_id) = @_;
    $self->{protein_feature_id} = $protein_feature_id;
}
272 =head2 accessors get_seq_text, set_seq_text
274 Usage: my $seq_text = $cds->get_seq_text()
275 Desc: gets the sequence text predicted by ESTScan.
276 BIG CAVEAT: this sequence contains the nucleotides
277 that ESTScan removed in lower case, while everything
278 else is in uppercase. Use get_cds_seq() to get the
279 correct cds sequence irrespective of method used.
280 Side Effects:
281 Example:
283 =cut
# Returns the seq_text property.
sub get_seq_text {
    my ($self) = @_;
    return $self->{seq_text};
}

# Stores the given value as the seq_text property.
sub set_seq_text {
    my ($self, $seq_text) = @_;
    $self->{seq_text} = $seq_text;
}
296 =head2 accessors get_cds_seq, set_cds_seq
298 Usage: my $cds_seq = $cds->get_cds_seq
299 Desc: gets the cds sequence
300 Side Effects: property usually set in constructor...
301 the value is populated from the cds_seq field
302 of the sgn.cds database table. This field contains
303 the correct cds sequence regardless of the method
304 used.
305 Example:
307 =cut
# Returns the cds_seq property.
sub get_cds_seq {
    my ($self) = @_;
    return $self->{cds_seq};
}

# Stores the given value as the cds_seq property.
sub set_cds_seq {
    my ($self, $cds_seq) = @_;
    $self->{cds_seq} = $cds_seq;
}
323 =head2 accessors get_seq_edits, set_seq_edits
325 Usage: my $seq_edits = $cds->get_seq_edits
326 Desc: gets the sequence that ESTScan predicted
327 Ret: with the lower case letters removed.
328 To access the cds regardless of method used,
329 use the get_cds_seq() accessor.
330 Args:
331 Side Effects:
332 Example:
334 =cut
# Returns the seq_edits property.
sub get_seq_edits {
    my ($self) = @_;
    return $self->{seq_edits};
}

# Stores the given value as the seq_edits property.
sub set_seq_edits {
    my ($self, $seq_edits) = @_;
    $self->{seq_edits} = $seq_edits;
}
347 =head2 accessors get_protein_seq, set_protein_seq
349 Usage:
350 Desc:
351 Ret:
352 Args:
353 Side Effects:
354 Example:
356 =cut
# Returns the protein_seq property.
sub get_protein_seq {
    my ($self) = @_;
    return $self->{protein_seq};
}

# Stores the given value as the protein_seq property.
sub set_protein_seq {
    my ($self, $protein_seq) = @_;
    $self->{protein_seq} = $protein_seq;
}
369 =head2 accessors get_begin, set_begin
371 Usage:
372 Desc:
373 Ret:
374 Args:
375 Side Effects:
376 Example:
378 =cut
# Returns the begin property.
sub get_begin {
    my ($self) = @_;
    return $self->{begin};
}

# Stores the given value as the begin property.
sub set_begin {
    my ($self, $begin) = @_;
    $self->{begin} = $begin;
}
391 =head2 accessors get_end, set_end
393 Usage:
394 Desc:
395 Ret:
396 Args:
397 Side Effects:
398 Example:
400 =cut
# Returns the end property.
sub get_end {
    my ($self) = @_;
    return $self->{end};
}

# Stores the given value as the end property.
sub set_end {
    my ($self, $end) = @_;
    $self->{end} = $end;
}
413 # =head2 accessors get_forward_reverse, set_forward_reverse
415 # Usage:
416 # Desc:
417 # Ret:
418 # Args:
419 # Side Effects:
420 # Example:
422 # =cut
424 # sub get_forward_reverse {
425 # my $self=shift;
426 # return $self->{forward_reverse};
430 # sub set_forward_reverse {
431 # my $self=shift;
432 # $self->{forward_reverse}=shift;
436 =head2 get_direction
438 Usage: my $direction = $cds->get_direction()
439 Desc: gets the direction of the cds relative to the
440 unigene. Either "F" for forward or "R" for
441 reverse.
442 Side Effects: this property is set in the constructor.
443 it maps to the somewhat cumbersomely named
444 forward_reverse column in the database.
445 Example:
447 =cut
# Returns the direction property (kept in the forward_reverse column of
# the database; see fetch and store).
sub get_direction {
    my ($self) = @_;
    return $self->{direction};
}

# Stores the given value as the direction property.
sub set_direction {
    my ($self, $direction) = @_;
    $self->{direction} = $direction;
}
460 =head2 accessors get_frame, set_frame
462 Usage:
463 Desc:
464 Ret:
465 Args:
466 Side Effects:
467 Example:
469 =cut
# Returns the frame property.
sub get_frame {
    my ($self) = @_;
    return $self->{frame};
}

# Stores the given value as the frame property.
sub set_frame {
    my ($self, $frame) = @_;
    $self->{frame} = $frame;
}
485 =head2 accessors get_run_id, set_run_id
487 Usage:
488 Desc:
489 Ret:
490 Args:
491 Side Effects:
492 Example:
494 =cut
# Returns the run_id property.
sub get_run_id {
    my ($self) = @_;
    return $self->{run_id};
}

# Stores the given value as the run_id property.
sub set_run_id {
    my ($self, $run_id) = @_;
    $self->{run_id} = $run_id;
}
507 =head2 accessors get_score, set_score
509 Usage:
510 Desc:
511 Ret:
512 Args:
513 Side Effects:
514 Example:
516 =cut
# Returns the score property.
sub get_score {
    my ($self) = @_;
    return $self->{score};
}

# Stores the given value as the score property.
sub set_score {
    my ($self, $score) = @_;
    $self->{score} = $score;
}
529 # =head2 accessors get_longest_frame_translation, set_longest_frame_translation
531 # Usage:
532 # Desc:
533 # Ret:
534 # Args:
535 # Side Effects:
536 # Example:
538 # =cut
540 # sub get_longest_frame_translation {
541 # my $self=shift;
542 # return $self->{longest_frame_translation};
546 # sub set_longest_frame_translation {
547 # my $self=shift;
548 # $self->{longest_frame_translation}=shift;
551 # =head2 accessors get_longest_frame, set_longest_frame
553 # Usage:
554 # Desc:
555 # Ret:
556 # Args:
557 # Side Effects:
558 # Example:
560 # =cut
562 # sub get_longest_frame {
563 # my $self=shift;
564 # return $self->{longest_frame};
568 # sub set_longest_frame {
569 # my $self=shift;
570 # $self->{longest_frame}=shift;
573 # =head2 function get_proteins
575 # Usage:
576 # Desc:
577 # Ret:
578 # Args:
579 # Side Effects:
580 # Example:
582 # =cut
584 # sub get_proteins {
585 # my $self = shift;
586 # my $sgn = $self->get_dbh()->qualify_schema("sgn");
587 # my $query = "SELECT protein_id FROM $sgn.protein WHERE cds_id=?";
588 # my $sth = $self->get_dbh()->prepare($query);
589 # $sth->execute($self->get_cds_id());
590 # my @proteins = ();
591 # while (my ($protein_id) = $sth->fetchrow_array()) {
592 # push @proteins, CXGN::Transcript::Protein->new($self->get_dbh(), $protein_id);
594 # return @proteins;
598 =head2 accessors get_method, set_method
600 Usage: my $method = $cds->get_method()
601 Desc: the method used to predict the cds/protein seq
602 Ret: either "estscan" or "longest6frame"
603 Args: none
605 =cut
# Returns the method property.
sub get_method {
    my ($self) = @_;
    return $self->{method};
}

# Stores the given value as the method property.
sub set_method {
    my ($self, $method) = @_;
    $self->{method} = $method;
}
619 =head2 get_preferred
621 Usage:
622 Desc:
623 Ret:
624 Args:
625 Side Effects:
626 Example:
628 =cut
# Returns the preferred property as stored by set_preferred.
sub get_preferred {
    my ($self) = @_;
    return $self->{preferred};
}
637 =head2 set_preferred
639 Usage:
640 Desc:
641 Ret:
642 Args:
643 Side Effects:
644 Example:
646 =cut
# Stores the preferred flag, normalising textual booleans first.
#   Args: $preferred - a value containing the letter "t" (any case) is
#         stored as 1; a value containing the letter "f" (any case) is
#         stored as 0; anything else is stored unchanged.
# The two checks run in sequence and the patterns are not anchored, so the
# "f" check sees the result of the "t" check.
sub set_preferred {
    my ($self, $preferred) = @_;

    $preferred = 1 if $preferred =~ /t/i;
    $preferred = 0 if $preferred =~ /f/i;

    $self->{preferred} = $preferred;
}
660 =head2 get_signalp_info
662 Usage: my ($nn_d, $nn_ypos, $nn_score) = $cds->get_signalp_info()
663 Desc:
664 Ret:
665 Args:
666 Side Effects:
667 Example:
669 =cut
# Reads the nn_ypos, nn_score and nn_d columns of the public.signalp row
# for this cds_id.
#   Ret: the list ($nn_d, $nn_ypos, $nn_score) -- note that nn_d comes
#        first, which differs from the column order in the SELECT. All
#        three are undef when no row matches.
sub get_signalp_info {
    my ($self) = @_;

    my $sql = "SELECT nn_ypos, nn_score, nn_d
                 FROM public.signalp
                WHERE cds_id=? ";
    my $sth = $self->get_dbh()->prepare($sql);
    $sth->execute($self->get_cds_id());

    my ($ypos, $score, $d) = $sth->fetchrow_array();
    return ($d, $ypos, $score);
}
683 =head2 function get_interpro_domains
685 Usage:
686 Desc:
687 Ret:
688 Args:
689 Side Effects:
690 Example:
692 =cut
# Lists the InterPro domain matches recorded for this cds_id, restricted
# to rows whose hit_status is 'T'.
#   Ret: a list of array references, one per match, each holding
#        [interpro_accession, description, match_begin, match_end].
#        The list is empty when nothing matches.
sub get_interpro_domains {
    my ($self) = @_;

    my $sql = "SELECT interpro_accession, interpro.description, match_begin, match_end
                 FROM sgn.interpro
                 LEFT JOIN sgn.domain USING (interpro_id)
                 LEFT JOIN sgn.domain_match USING (domain_id)
                WHERE cds_id = ?
                  AND hit_status LIKE 'T' ";
    my $sth = $self->get_dbh()->prepare_cached($sql);
    $sth->execute($self->get_cds_id());

    my @domains;
    while (my @match = $sth->fetchrow_array()) {
        # Four columns are selected; copy them into a fresh array ref.
        push @domains, [ @match[0 .. 3] ];
    }
    return @domains;
}
712 return 1;