1 package CXGN
::Tools
::Sequence
;
6 Creates a new sequence object, which has a type, id, and sequence,
7 sent as an argument hash
8 CXGN::Tools::Sequence->new({ id=>..., seq=>..., type=>... })
10 Additionally, you may provide options for CDS translation to peptide:
11 strict_cds: If positive, don't allow mid-sequence stop codons or incomplete CDS's
12 allow_incomplete: CDS can be a fragment, not start w/ M, have a non-codon tail (ignored)
13 ignore_stop: Stop codons in the middle of the sequence are skipped over
15 If you are using strict mode, the translation will call die() whenever problems arise
21 my $self = bless {}, $class;
23 $self->_default($args);
31 $self->{strict_cds
} = 0;
32 $self->{ignore_stop
} = 1;
33 $self->{allow_incomplete
} = 1;
34 my $random = int(rand(100000000));
35 $self->{id
} = "Unknown-$random";
39 if(ref($args) eq "HASH"){
40 foreach(qw
/ type id seq
41 strict_cds ignore_stop allow_incomplete
/
43 $self->{$_} = $args->{$_} if(exists $args->{$_});
50 if($self->{strict_cds
}){
51 $self->{ignore_stop
} = 0;
52 $self->{allow_incomplete
} = 0;
57 =head2 translate_cds()
59 Type must be set to "cds", otherwise returns with nothing.
60 Takes the sequence and translates it into a polypeptide, returning
61 the peptide sequence when finished and setting the hash key
62 $self->{translated_protein_seq}
68 unless($self->{type
} eq "cds") {
69 print STDERR
"Type not set to 'cds', early return";
72 my $seq = $self->{seq
};
76 while($i < length($cds_seq)){
77 my $codon = substr($cds_seq, $i, 3);
79 $aa = $self->_translate_codon($codon, $i);
80 if(!$self->{allow_incomplete
} && $i==0 && $aa ne "M"){
81 die "Polypeptide does not start with a Methionine, for the sequence identified by " . $self->{id
} . "\n";
85 $protein_seq .= "X" if $self->{ignore_stop
}; #assuming that cds is wrong
86 $protein_seq .= "*" if !$self->{ignore_stop
};
87 last unless $self->{ignore_stop
};
94 unless ($self->{ignore_stop
} || $i >= length($cds_seq)){
95 my $stop_codon = substr($cds_seq, $i, 3);
96 my $next_codon = substr($cds_seq, $i+3, 3);
97 die "Early termination of CDS for the sequence identified by " . $self->{id
} . "\n$stop_codon-$next_codon... position $i short of " . length($cds_seq);
100 $self->{translated_protein_seq
} = $protein_seq;
104 sub _translate_codon
{
105 #Ridiculous and Unnecessary? Maybe, but I wanted to do it,
106 #and Bio::Tools::CodonTable requires overhead
108 my $codon = uc ( shift );
109 my $pos = 1 + ( shift );
111 if(length($codon)<3){
112 die "Not a complete codon" unless $self->{allow_incomplete
};
113 return "X"; #consider revising
115 if(length($codon)>3){
116 die "Codon more than 3 base pairs long. Fix this bug!\n";
118 return "-" if $codon =~ /^(-|\.){3}$/;
120 die "Invalid codon: $codon at position $pos (". $self->{id
} . ")"
121 unless $codon =~ /^(A|T|G|C|N|R|Y|S|W|K|M|B|D|H|V|X)+$/;
123 return "*" if $codon =~ /(TA((G|A)|R))|(TGA)/; #STOP codon
124 return "M" if $codon =~ /ATG/;
125 return "I" if $codon =~ /AT((T|C|A)|H|Y|M|W)/;
126 return "L" if $codon =~ /(CT.)|(TT((A|G)|R))/;
127 return "V" if $codon =~ /GT./;
128 return "F" if $codon =~ /TT((T|C)|Y)/;
129 return "C" if $codon =~ /TG((T|C)|Y)/;
130 return "A" if $codon =~ /GC./;
131 return "G" if $codon =~ /GG./;
132 return "P" if $codon =~ /CC./;
133 return "T" if $codon =~ /AC./;
134 return "S" if $codon =~ /(TC.)|(AG((T|C)|Y))/;
135 return "Y" if $codon =~ /TA((T|C)|Y)/;
136 return "W" if $codon =~ /TGG/;
137 return "Q" if $codon =~ /CA((A|G)|R)/;
138 return "N" if $codon =~ /AA((T|C)|Y)/;
139 return "H" if $codon =~ /CA((T|C)|Y)/;
140 return "E" if $codon =~ /GA((A|G)|R)/;
141 return "D" if $codon =~ /GA((T|C)|Y)/;
142 return "K" if $codon =~ /AA((A|G)|R)/;
143 return "R" if $codon =~ /(CG.)|(AG((A|G)|R))/;
148 =head3 cds_insert_gaps
150 Type must be 'cds', use this function if you have an
151 un-gapped cds and would like to gap-it, given a gapped
152 protein sequence returned by the alignment program.
154 Arguments: A gapped *protein* sequence
156 Side Effects: Gaps up the current CDS. Also, I recommend
157 setting 'cds_nocheck' in the constructor, as this will
158 run MUCH more quickly if you already know that the
159 protein sequence and CDS are properly translated.
163 sub cds_insert_gaps
{
165 unless($self->{type
} eq "cds"){
166 print STDERR
"Type not set to 'cds', early return\n";
169 my $gapped_prot_seq = shift;
171 my $ungap_prot = $gapped_prot_seq;
172 $ungap_prot =~ s/\W//g;
173 unless(length($self->{seq
}) <= (3*length($ungap_prot)+3)) {
174 my $message = "Error: Protein length (" . length($ungap_prot) .
175 ") must be less than or equal to 1/3 of CDS length ( " .
176 length($self->{seq
}) .
177 " minus stop codon) for gapping to work. Early return.\n";
178 print STDERR
$message;
182 my $revprot = reverse $gapped_prot_seq;
185 while(my $aa = chop($revprot)){
187 $cds_gapped .= "---";
190 my $codon = substr($self->{seq
}, $cds_pos, 3);
191 die "Protein seq does not match CDS at position $cds_pos (" . $self->{id
} . ")\n" unless ($self->{cds_nocheck
} || $aa == ($self->_translate_codon($codon)) );
192 $cds_gapped .= $codon;
196 $self->{seq
} = $cds_gapped;
200 =head2 use_liberal_cds()
202 Mid-sequence stop codons ignored, replaced w/ 'X'
203 Incomplete CDS is allowed, assuming frame starts on
208 sub use_liberal_cds
{
210 $self->{strict_cds
} = 0;
211 $self->{ignore_stop
} = 1;
212 $self->{allow_incomplete
} = 1;
215 =head2 use_strict_cds
217 CDS must be complete (length divisible by three),
218 starting with a methionine.
219 STOP codons mid-sequence will cause death.
225 $self->{strict_cds
} = 1;
226 $self->{ignore_stop
} = 0;
227 $self->{allow_incomplete
} = 0;