change rules for cluster accessible dirs.
[cxgn-corelibs.git] / lib / CXGN / Tools / Sequence.pm
blob9b07b3632b5d4c24ddf507e3748cd1440e4984f6
1 package CXGN::Tools::Sequence;
2 use strict;
4 =head2 new()
6 Creates a new sequence object, which has a type, id, and sequence,
7 sent as an argument hash
8 CXGN::Tools::Sequence->new({ id=>..., seq=>..., type=>... })
10 Additionally, you may provide options for CDS translation to peptide:
11 strict_cds: If positive, don't allow mid-sequence stop codons or incomplete CDS's
12 allow_incomplete: CDS can be a fragment, not start w/ M, have a non-codon tail (ignored)
13 ignore_stop: Stop codons in the middle of the sequence are skipped over
15 If you are using strict mode, the translation will call die() whenever problems arise
17 =cut
19 sub new {
20 my $class = shift;
21 my $self = bless {}, $class;
22 my $args = shift;
23 $self->_default($args);
24 return $self;
27 sub _default {
28 my $self = shift;
29 my $args = shift;
31 $self->{strict_cds} = 0;
32 $self->{ignore_stop} = 1;
33 $self->{allow_incomplete} = 1;
34 my $random = int(rand(100000000));
35 $self->{id} = "Unknown-$random";
36 $self->{seq} = "";
37 $self->{type} = "nt";
39 if(ref($args) eq "HASH"){
40 foreach(qw/ type id seq
41 strict_cds ignore_stop allow_incomplete /
43 $self->{$_} = $args->{$_} if(exists $args->{$_});
46 else {
47 $self->{seq} = $args;
50 if($self->{strict_cds}){
51 $self->{ignore_stop} = 0;
52 $self->{allow_incomplete} = 0;
57 =head2 translate_cds()
59 Type must be set to "cds", otherwise returns with nothing.
60 Takes the sequence and translates it into a polypeptide, returning
61 the peptide sequence when finished and setting the hash key
62 $self->{translated_protein_seq}
64 =cut
66 sub translate_cds {
67 my $self = shift;
68 unless($self->{type} eq "cds") {
69 print STDERR "Type not set to 'cds', early return";
70 return;
72 my $seq = $self->{seq};
73 my $cds_seq = $seq;
74 my $protein_seq = "";
75 my $i = 0;
76 while($i < length($cds_seq)){
77 my $codon = substr($cds_seq, $i, 3);
78 my $aa = "";
79 $aa = $self->_translate_codon($codon, $i);
80 if(!$self->{allow_incomplete} && $i==0 && $aa ne "M"){
81 die "Polypeptide does not start with a Methionine, for the sequence identified by " . $self->{id} . "\n";
83 if($aa eq "*"){
84 $i+=3;
85 $protein_seq .= "X" if $self->{ignore_stop}; #assuming that cds is wrong
86 $protein_seq .= "*" if !$self->{ignore_stop};
87 last unless $self->{ignore_stop};
89 else {
90 $protein_seq .= $aa;
91 $i+=3;
94 unless ($self->{ignore_stop} || $i >= length($cds_seq)){
95 my $stop_codon = substr($cds_seq, $i, 3);
96 my $next_codon = substr($cds_seq, $i+3, 3);
97 die "Early termination of CDS for the sequence identified by " . $self->{id} . "\n$stop_codon-$next_codon... position $i short of " . length($cds_seq);
100 $self->{translated_protein_seq} = $protein_seq;
101 return $protein_seq;
104 sub _translate_codon {
105 #Ridiculous and Unnecessary? Maybe, but I wanted to do it,
106 #and Bio::Tools::CodonTable requires overhead
107 my $self = shift;
108 my $codon = uc ( shift );
109 my $pos = 1 + ( shift );
110 $codon =~ tr/U/T/;
111 if(length($codon)<3){
112 die "Not a complete codon" unless $self->{allow_incomplete};
113 return "X"; #consider revising
115 if(length($codon)>3){
116 die "Codon more than 3 base pairs long. Fix this bug!\n";
118 return "-" if $codon =~ /^(-|\.){3}$/;
120 die "Invalid codon: $codon at position $pos (". $self->{id} . ")"
121 unless $codon =~ /^(A|T|G|C|N|R|Y|S|W|K|M|B|D|H|V|X)+$/;
123 return "*" if $codon =~ /(TA((G|A)|R))|(TGA)/; #STOP codon
124 return "M" if $codon =~ /ATG/;
125 return "I" if $codon =~ /AT((T|C|A)|H|Y|M|W)/;
126 return "L" if $codon =~ /(CT.)|(TT((A|G)|R))/;
127 return "V" if $codon =~ /GT./;
128 return "F" if $codon =~ /TT((T|C)|Y)/;
129 return "C" if $codon =~ /TG((T|C)|Y)/;
130 return "A" if $codon =~ /GC./;
131 return "G" if $codon =~ /GG./;
132 return "P" if $codon =~ /CC./;
133 return "T" if $codon =~ /AC./;
134 return "S" if $codon =~ /(TC.)|(AG((T|C)|Y))/;
135 return "Y" if $codon =~ /TA((T|C)|Y)/;
136 return "W" if $codon =~ /TGG/;
137 return "Q" if $codon =~ /CA((A|G)|R)/;
138 return "N" if $codon =~ /AA((T|C)|Y)/;
139 return "H" if $codon =~ /CA((T|C)|Y)/;
140 return "E" if $codon =~ /GA((A|G)|R)/;
141 return "D" if $codon =~ /GA((T|C)|Y)/;
142 return "K" if $codon =~ /AA((A|G)|R)/;
143 return "R" if $codon =~ /(CG.)|(AG((A|G)|R))/;
145 return "X"; #unknown
148 =head3 cds_insert_gaps
150 Type must be 'cds', use this function if you have an
151 un-gapped cds and would like to gap-it, given a gapped
152 protein sequence returned by the alignment program.
154 Arguments: A gapped *protein* sequence
155 Returns: Nothing
156 Side Effects: Gaps up the current CDS. Also, I recommend
157 setting 'cds_nocheck' in the constructor, as this will
158 run MUCH more quickly if you already know that the
159 protein sequence and CDS are properly translated.
161 =cut
163 sub cds_insert_gaps {
164 my $self = shift;
165 unless($self->{type} eq "cds"){
166 print STDERR "Type not set to 'cds', early return\n";
167 return;
169 my $gapped_prot_seq = shift;
171 my $ungap_prot = $gapped_prot_seq;
172 $ungap_prot =~ s/\W//g;
173 unless(length($self->{seq}) <= (3*length($ungap_prot)+3)) {
174 my $message = "Error: Protein length (" . length($ungap_prot) .
175 ") must be less than or equal to 1/3 of CDS length ( " .
176 length($self->{seq}) .
177 " minus stop codon) for gapping to work. Early return.\n";
178 print STDERR $message;
179 return;
182 my $revprot = reverse $gapped_prot_seq;
183 my $cds_gapped = "";
184 my $cds_pos = 0;
185 while(my $aa = chop($revprot)){
186 if($aa eq '-'){
187 $cds_gapped .= "---";
189 else {
190 my $codon = substr($self->{seq}, $cds_pos, 3);
191 die "Protein seq does not match CDS at position $cds_pos (" . $self->{id} . ")\n" unless ($self->{cds_nocheck} || $aa == ($self->_translate_codon($codon)) );
192 $cds_gapped .= $codon;
193 $cds_pos+=3;
196 $self->{seq} = $cds_gapped;
197 return $cds_gapped;
200 =head2 use_liberal_cds()
202 Mid-sequence stop codons ignored, replaced w/ 'X'
203 Incomplete CDS is allowed, assuming frame starts on
204 first character.
206 =cut
208 sub use_liberal_cds {
209 my $self = shift;
210 $self->{strict_cds} = 0;
211 $self->{ignore_stop} = 1;
212 $self->{allow_incomplete} = 1;
215 =head2 use_strict_cds
217 CDS must be complete (length divisible by three),
218 starting with a methionine.
219 STOP codons mid-sequence will cause death.
221 =cut
223 sub use_strict_cds {
224 my $self = shift;
225 $self->{strict_cds} = 1;
226 $self->{ignore_stop} = 0;
227 $self->{allow_incomplete} = 0;