4 CXGN::Transcript::CDS - a class that deals with coding sequence associated with unigenes.
8 Stores predicted CDS and protein sequences from unigene data. The two methods that are currently used are ESTScan and a simple longest 6 frame translation.
10 The ESTScan CDS predictions contain edits. Insertions are reflected by X's inserted into the sequence, while removed nucleotides are represented by lower case letters in an uppercase background. This sequence is stored in the seq_text field. The actual cds sequence with the lower case letters removed is stored in the seq_edits field.
12 In general, the correct cds sequence is stored into cds.cds_seq column, and is available through the get_cds_seq accessor.
13 The corresponding protein sequence is available through get_protein_seq. Note that ESTScan has a bug in the way it calculates the protein (ignores N's in the nucleotide sequence); therefore, protein sequences need to be calculated using the cds_translate.pl script in /sgn-tools/unigene.
15 The longest 6-frame translations can be generated using the get_longest_protein.pl script in /sgn-tools/protein. It returns both the cds and the protein sequence in two separate fasta files.
20 Lukas Mueller <lam87@cornell.edu>
24 reformat_unigene_sequence_with_edit() added 06/23/08 by Mallory Freeberg
28 This class defines the following methods:
34 package CXGN
::Transcript
::CDS
;
36 use base qw
| CXGN
::DB
::Object
|;
37 use base qw
| Bio
::Seq
|;
38 use base qw
| CXGN
::Transcript
::Unigene
|;
55 my $self = $class->SUPER::new
($dbh);
57 $self->set_cds_id($id);
63 =head2 new_with_unigene_id
74 sub new_with_unigene_id
{
77 my $unigene_id = shift;
78 my $query = "SELECT cds_id FROM cds WHERE unigene_id = ?";
79 my $sth = $dbh->prepare($query);
80 $sth->execute($unigene_id);
81 my ($cds_id) = $sth->fetchrow_array();
83 my $self = CXGN
::Transcript
::CDS
->new($dbh, $cds_id);
101 my $unigene_id = shift;
104 my $query = "SELECT cds_id FROM sgn.cds WHERE unigene_id=? and type=?";
106 my $sth = $dbh->prepare($query);
107 $sth->execute($unigene_id, $type);
110 while (my ($cds_id) = $sth->fetchrow_array()) {
111 push @cds_ids, $cds_id;
132 my $query = "SELECT cds_id, unigene_id, protein_feature_id, seq_text, seq_edits, cds_seq, protein_seq, cds.\"begin\", cds.\"end\", cds.forward_reverse, frame, cds.run_id, cds.score, method, preferred FROM sgn.cds WHERE cds_id=?";
133 my $sth = $self->get_dbh()->prepare($query);
134 $sth->execute($self->get_cds_id());
135 my ($cds_id, $unigene_id, $protein_feature_id, $seq_text, $seq_edits, $cds_seq, $protein_seq, $begin, $end, $forward_reverse, $frame, $run_id, $score, $method, $preferred) =
136 $sth->fetchrow_array();
138 $self->set_cds_id($cds_id);
139 $self->set_unigene_id($unigene_id);
140 $self->set_protein_feature_id($protein_feature_id);
141 $self->set_cds_seq($cds_seq);
142 $self->set_seq_text($seq_text);
143 $self->set_seq_edits($seq_edits);
144 $self->set_protein_seq($protein_seq);
145 $self->set_begin($begin);
146 $self->set_end($end);
147 $self->set_direction($forward_reverse);
148 $self->set_frame($frame);
149 $self->set_run_id($run_id);
150 $self->set_score($score);
151 $self->set_method($method);
152 $self->set_preferred($preferred);
159 if ($self->get_cds_id()) {
160 my $query = "UPDATE sgn.cds set
161 unigene_id=?, protein_feature_id=?, seq_text=?, seq_edits=?, cds_seq=?, protein_seq=?, \"begin\"=?, \"end\"=?, forward_reverse=?, frame=?, run_id=?, score=?, method=?, preferred=?
163 my $sth = $self->get_dbh()->prepare($query);
165 $self->get_unigene_id(),
166 $self->get_protein_feature_id(),
167 $self->get_seq_text(),
168 $self->get_seq_edits(),
169 $self->get_cds_seq(),
170 $self->get_protein_seq(),
173 $self->get_direction(),
178 $self->get_preferred(),
186 my $query = "INSERT INTO sgn.cds (
187 unigene_id, protein_feature_id, seq_text, seq_edits, cds_seq, protein_seq, \"begin\", \"end\", forward_reverse, frame, run_id, score, method, preferred) VALUES (?, ?,?, ?, ?, ?, ?, ?, ?, ?,?, ?,?,? )";
188 my $sth = $self->get_dbh()->prepare($query);
190 $self->get_unigene_id(),
191 $self->get_protein_feature_id(),
192 $self->get_seq_text(),
193 $self->get_seq_edits(),
194 $self->get_cds_seq(),
195 $self->get_protein_seq(),
198 $self->get_direction(),
203 $self->get_preferred()
205 return $self->get_currval("sgn.cds_cds_id_seq");
210 =head2 accessors get_cds_id, set_cds_id
213 Property: the primary key of the table.
219 return $self->{cds_id
};
225 $self->{cds_id
}=shift;
228 =head2 accessors get_unigene_id, set_unigene_id
231 Property: foreign key to unigene. The unigene this
232 cds entry is associated with.
242 return $self->{unigene_id
};
248 $self->{unigene_id
}=shift;
251 =head2 accessors get_protein_feature_id, set_protein_feature_id
261 sub get_protein_feature_id
{
263 return $self->{protein_feature_id
};
266 sub set_protein_feature_id
{
268 $self->{protein_feature_id
} = shift;
272 =head2 accessors get_seq_text, set_seq_text
274 Usage: my $seq_text = $cds->get_seq_text()
275 Desc: gets the sequence text predicted by ESTScan.
276 BIG CAVEAT: this sequence contains the nucleotides
277 that ESTScan removed in lower case, while everything
278 else is in uppercase. Use get_cds_seq() to get the
279 correct cds sequence irrespective of method used.
287 return $self->{seq_text
};
293 $self->{seq_text
}=shift;
296 =head2 accessors get_cds_seq, set_cds_seq
298 Usage: my $cds_seq = $cds->get_cds_seq
299 Desc: gets the cds sequence
300 Side Effects: property usually set in constructor...
301 the value is populated from the cds_seq field
302 of the sgn.cds database table. This field contains
303 the correct cds sequence regardless of the method
311 return $self->{cds_seq
};
318 $self->{cds_seq
}=shift;
323 =head2 accessors get_seq_edits, set_seq_edits
325 Usage: my $seq_edits = $cds->get_seq_edits
326 Desc: gets the sequence that ESTScan predicted
327 Ret: with the lower case letters removed.
328 To access the cds regardless of method used,
329 use the get_cds_seq() accessor.
338 return $self->{seq_edits
};
344 $self->{seq_edits
}=shift;
347 =head2 accessors get_protein_seq, set_protein_seq
358 sub get_protein_seq
{
360 return $self->{protein_seq
};
364 sub set_protein_seq
{
366 $self->{protein_seq
}=shift;
369 =head2 accessors get_begin, set_begin
382 return $self->{begin
};
388 $self->{begin
}=shift;
391 =head2 accessors get_end, set_end
413 # =head2 accessors get_forward_reverse, set_forward_reverse
424 # sub get_forward_reverse {
426 # return $self->{forward_reverse};
430 # sub set_forward_reverse {
432 # $self->{forward_reverse}=shift;
438 Usage: my $direction = $cds->get_direction()
439 Desc: gets the direction of the cds relative to the
440 unigene. Either "F" for forward or "R" for
442 Side Effects: this property is set in the constructor.
443 it maps to the somewhat cumbersomely named
444 forward_reverse column in the database.
451 return $self->{direction
};
457 $self->{direction
}=shift;
460 =head2 accessors get_frame, set_frame
473 return $self->{frame
};
480 $self->{frame
}=shift;
485 =head2 accessors get_run_id, set_run_id
498 return $self->{run_id
};
504 $self->{run_id
}=shift;
507 =head2 accessors get_score, set_score
520 return $self->{score
};
526 $self->{score
}=shift;
529 # =head2 accessors get_longest_frame_translation, set_longest_frame_translation
540 # sub get_longest_frame_translation {
542 # return $self->{longest_frame_translation};
546 # sub set_longest_frame_translation {
548 # $self->{longest_frame_translation}=shift;
551 # =head2 accessors get_longest_frame, set_longest_frame
562 # sub get_longest_frame {
564 # return $self->{longest_frame};
568 # sub set_longest_frame {
570 # $self->{longest_frame}=shift;
573 # =head2 function get_proteins
586 # my $sgn = $self->get_dbh()->qualify_schema("sgn");
587 # my $query = "SELECT protein_id FROM $sgn.protein WHERE cds_id=?";
588 # my $sth = $self->get_dbh()->prepare($query);
589 # $sth->execute($self->get_cds_id());
591 # while (my ($protein_id) = $sth->fetchrow_array()) {
592 # push @proteins, CXGN::Transcript::Protein->new($self->get_dbh(), $protein_id);
598 =head2 accessors get_method, set_method
600 Usage: my $method = $cds->get_method()
601 Desc: the method used to predict the cds/protein seq
602 Ret: either "estscan" or "longest6frame"
609 return $self->{method
};
615 $self->{method
}=shift;
633 return $self->{preferred
};
650 my $preferred = shift;
651 if ($preferred=~/t/i) {
654 if ($preferred=~/f/i) {
657 $self->{preferred
}=$preferred;
660 =head2 get_signalp_info
662 Usage: my ($nn_d, $nn_ypos, $nn_score) = $cds->get_signalp_info()
671 sub get_signalp_info
{
673 my $query = "SELECT nn_ypos, nn_score, nn_d
676 my $sth = $self->get_dbh()->prepare($query);
677 $sth->execute($self->get_cds_id());
678 my ($nn_ypos, $nn_score, $nn_d) = $sth->fetchrow_array();
679 return ($nn_d, $nn_ypos, $nn_score);
683 =head2 function get_interpro_domains
694 sub get_interpro_domains
{
696 my $query = "SELECT interpro_accession, interpro.description, match_begin, match_end
698 LEFT JOIN sgn.domain USING (interpro_id)
699 LEFT JOIN sgn.domain_match USING (domain_id)
701 AND hit_status LIKE 'T' ";
702 my $sth = $self->get_dbh()->prepare_cached($query);
703 $sth->execute($self->get_cds_id());
704 my @interpro_domain_list = ();
705 while (my ($interpro_accession, $description, $match_begin, $match_end)= $sth->fetchrow_array()) {
706 push @interpro_domain_list, [$interpro_accession,$description, $match_begin, $match_end];
708 return @interpro_domain_list;