t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / Tools / CodonTable.pm
blobfdc409d290560a764f3ec2a3b0367bd5f54520ac
2 # bioperl module for Bio::Tools::CodonTable
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
8 # Copyright Heikki Lehvaslaiho
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::CodonTable - Codon table object
18 =head1 SYNOPSIS
20 # This is a read-only class for all known codon tables. The IDs are
21 # the ones used by nucleotide sequence databases. All common IUPAC
22 # ambiguity codes for DNA, RNA and amino acids are recognized.
24 use Bio::Tools::CodonTable;
26 # defaults to ID 1 "Standard"
27 $myCodonTable = Bio::Tools::CodonTable->new();
28 $myCodonTable2 = Bio::Tools::CodonTable->new( -id => 3 );
30 # change codon table
31 $myCodonTable->id(5);
33 # examine codon table
34 print join (' ', "The name of the codon table no.", $myCodonTable->id(4),
35 "is:", $myCodonTable->name(), "\n");
37 # print possible codon tables
38 $tables = Bio::Tools::CodonTable->tables;
39 while ( ($id,$name) = each %{$tables} ) {
40 print "$id = $name\n";
43 # translate a codon
44 $aa = $myCodonTable->translate('ACU');
45 $aa = $myCodonTable->translate('act');
46 $aa = $myCodonTable->translate('ytr');
48 # reverse translate an amino acid
49 @codons = $myCodonTable->revtranslate('A');
50 @codons = $myCodonTable->revtranslate('Ser');
51 @codons = $myCodonTable->revtranslate('Glx');
52 @codons = $myCodonTable->revtranslate('cYS', 'rna');
54 # reverse translate an entire amino acid sequence into a IUPAC
55 # nucleotide string
57 my $seqobj = Bio::PrimarySeq->new(-seq => 'FHGERHEL');
58 my $iupac_str = $myCodonTable->reverse_translate_all($seqobj);
60 # boolean tests
61 print "Is a start\n" if $myCodonTable->is_start_codon('ATG');
62 print "Is a terminator\n" if $myCodonTable->is_ter_codon('tar');
63 print "Is a unknown\n" if $myCodonTable->is_unknown_codon('JTG');
65 =head1 DESCRIPTION
67 Codon tables are also called translation tables or genetic codes
68 since that is what they represent. A bit more complete picture
69 of the full complexity of codon usage in various taxonomic groups
70 is presented at the NCBI Genetic Codes Home page.
72 CodonTable is a BioPerl class that knows all current translation
73 tables that are used by primary nucleotide sequence databases
74 (GenBank, EMBL and DDBJ). It provides methods to output information
75 about tables and relationships between codons and amino acids.
77 This class and its methods recognized all common IUPAC ambiguity codes
78 for DNA, RNA and animo acids. The translation method follows the
79 conventions in EMBL and TREMBL databases.
81 It is a nuisance to separate RNA and cDNA representations of nucleic
82 acid transcripts. The CodonTable object accepts codons of both type as
83 input and allows the user to set the mode for output when reverse
84 translating. Its default for output is DNA.
86 Note:
88 This class deals primarily with individual codons and amino
89 acids. However in the interest of speed you can L<translate>
90 longer sequence, too. The full complexity of protein translation
91 is tackled by L<Bio::PrimarySeqI::translate>.
94 The amino acid codes are IUPAC recommendations for common amino acids:
96 A Ala Alanine
97 R Arg Arginine
98 N Asn Asparagine
99 D Asp Aspartic acid
100 C Cys Cysteine
101 Q Gln Glutamine
102 E Glu Glutamic acid
103 G Gly Glycine
104 H His Histidine
105 I Ile Isoleucine
106 L Leu Leucine
107 K Lys Lysine
108 M Met Methionine
109 F Phe Phenylalanine
110 P Pro Proline
111 O Pyl Pyrrolysine (22nd amino acid)
112 U Sec Selenocysteine (21st amino acid)
113 S Ser Serine
114 T Thr Threonine
115 W Trp Tryptophan
116 Y Tyr Tyrosine
117 V Val Valine
118 B Asx Aspartic acid or Asparagine
119 Z Glx Glutamine or Glutamic acid
120 J Xle Isoleucine or Valine (mass spec ambiguity)
121 X Xaa Any or unknown amino acid
124 It is worth noting that, "Bacterial" codon table no. 11 produces an
125 polypeptide that is, confusingly, identical to the standard one. The
126 only differences are in available initiator codons.
129 NCBI Genetic Codes home page:
130 (Last update of the Genetic Codes: Nov. 18, 2016)
131 https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
133 ASN.1 version with ids 1 to 25 is at:
134 ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
136 Thanks to Matteo diTomasso for the original Perl implementation
137 of these tables.
139 =head1 FEEDBACK
141 =head2 Mailing Lists
143 User feedback is an integral part of the evolution of this and other
144 Bioperl modules. Send your comments and suggestions preferably to the
145 Bioperl mailing lists Your participation is much appreciated.
147 bioperl-l@bioperl.org - General discussion
148 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
150 =head2 Support
152 Please direct usage questions or support issues to the mailing list:
154 I<bioperl-l@bioperl.org>
156 rather than to the module maintainer directly. Many experienced and
157 reponsive experts will be able look at the problem and quickly
158 address it. Please include a thorough description of the problem
159 with code and data examples if at all possible.
161 =head2 Reporting Bugs
163 Report bugs to the Bioperl bug tracking system to help us keep track
164 the bugs and their resolution. Bug reports can be submitted via the
165 web:
167 https://github.com/bioperl/bioperl-live/issues
169 =head1 AUTHOR - Heikki Lehvaslaiho
171 Email: heikki-at-bioperl-dot-org
173 =head1 APPENDIX
175 The rest of the documentation details each of the object
176 methods. Internal methods are usually preceded with a _
178 =cut
180 # Let the code begin...
182 package Bio::Tools::CodonTable;
183 use vars qw(@NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA $CODONGAP $GAP
184 %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
185 use strict;
187 # Object preamble - inherits from Bio::Root::Root
188 use Bio::Tools::IUPAC;
189 use Bio::SeqUtils;
191 use base qw(Bio::Root::Root);
194 # first set internal values for all translation tables
196 BEGIN {
197 use constant CODONSIZE => 3;
198 $GAP = '-';
199 $CODONGAP = $GAP x CODONSIZE;
201 @NAMES = #id
203 'Strict', #0, special option for ATG-only start
204 'Standard', #1
205 'Vertebrate Mitochondrial',#2
206 'Yeast Mitochondrial',# 3
207 'Mold, Protozoan, and Coelenterate Mitochondrial and Mycoplasma/Spiroplasma',#4
208 'Invertebrate Mitochondrial',#5
209 'Ciliate, Dasycladacean and Hexamita Nuclear',# 6
210 '', '',
211 'Echinoderm and Flatworm Mitochondrial',#9
212 'Euplotid Nuclear',#10
213 'Bacterial, Archaeal and Plant Plastid',# 11
214 'Alternative Yeast Nuclear',# 12
215 'Ascidian Mitochondrial',# 13
216 'Alternative Flatworm Mitochondrial',# 14
218 'Chlorophycean Mitochondrial',# 16
219 '', '', '', '',
220 'Trematode Mitochondrial',# 21
221 'Scenedesmus obliquus Mitochondrial', #22
222 'Thraustochytrium Mitochondrial', #23
223 'Pterobranchia Mitochondrial', #24
224 'Candidate Division SR1 and Gracilibacteria', #25
225 'Pachysolen tannophilus Nuclear Code', #26
226 'Karyorelict Nuclear', #27
227 'Condylostoma Nuclear', #28
228 'Mesodinium Nuclear', #29
229 'Peritrich Nuclear', #30
230 'Blastocrithidia Nuclear' #31
233 @TABLES =
235 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
236 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
237 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG
238 FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
239 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
240 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
241 FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
242 '' ''
243 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
244 FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
245 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
246 FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
247 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
248 FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
250 FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
251 '' '' '' ''
252 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
253 FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
254 FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
255 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG
256 FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
257 FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
258 FFLLSSSSYYQQCCWWLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
259 FFLLSSSSYYQQCCWWLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
260 FFLLSSSSYYYYCC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
261 FFLLSSSSYYEECC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
262 FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
265 # (bases used for these tables, for reference)
266 # 1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
267 # 2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
268 # 3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
270 @STARTS =
272 ----------**--*--------------------M----------------------------
273 ---M------**--*----M---------------M----------------------------
274 ----------**--------------------MMMM----------**---M------------
275 ----------**----------------------MM----------------------------
276 --MM------**-------M------------MMMM---------------M------------
277 ---M------**--------------------MMMM---------------M------------
278 --------------*--------------------M----------------------------
279 '' ''
280 ----------**-----------------------M---------------M------------
281 ----------**-----------------------M----------------------------
282 ---M------**--*----M------------MMMM---------------M------------
283 ----------**--*----M---------------M----------------------------
284 ---M------**----------------------MM---------------M------------
285 -----------*-----------------------M----------------------------
287 ----------*---*--------------------M----------------------------
288 '' '' '' ''
289 ----------**-----------------------M---------------M------------
290 ------*---*---*--------------------M----------------------------
291 --*-------**--*-----------------M--M---------------M------------
292 ---M------**-------M---------------M---------------M------------
293 ---M------**-----------------------M---------------M------------
294 ----------**--*----M---------------M----------------------------
295 --------------*--------------------M----------------------------
296 ----------**--*--------------------M----------------------------
297 --------------*--------------------M----------------------------
298 --------------*--------------------M----------------------------
299 ----------**-----------------------M----------------------------
302 my @nucs = qw(t c a g);
303 my $x = 0;
304 ($CODONS, $TRCOL) = ({}, {});
305 for my $i (@nucs) {
306 for my $j (@nucs) {
307 for my $k (@nucs) {
308 my $codon = "$i$j$k";
309 $CODONS->{$codon} = $x;
310 $TRCOL->{$x} = $codon;
311 $x++;
315 %IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub();
316 %IUPAC_AA = Bio::Tools::IUPAC->iupac_iup();
317 %THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2);
318 $VALID_PROTEIN = '['.join('',Bio::SeqUtils->valid_aa(0)).']';
319 $TERMINATOR = '*';
322 sub new {
323 my($class,@args) = @_;
324 my $self = $class->SUPER::new(@args);
326 my($id) =
327 $self->_rearrange([qw(ID
329 @args);
331 $id = 1 if ( ! $id );
332 $id && $self->id($id);
333 return $self; # success - we hope!
336 =head2 id
338 Title : id
339 Usage : $obj->id(3); $id_integer = $obj->id();
340 Function: Sets or returns the id of the translation table. IDs are
341 integers from 0 (special ATG-only start) to 25, excluding
342 7-8 and 17-20 which have been removed. If an invalid ID is
343 given the method returns 1, the standard table.
344 Example :
345 Returns : value of id, a scalar, warn and fall back to 1 (standard table)
346 if specified id is not valid
347 Args : newvalue (optional)
349 =cut
351 sub id{
352 my ($self,$value) = @_;
353 if( defined $value) {
354 if ( not defined $TABLES[$value] or $TABLES[$value] eq '') {
355 $self->warn("Not a valid codon table ID [$value], using [1] instead ");
356 $value = 1;
358 $self->{'id'} = $value;
360 return $self->{'id'};
363 =head2 name
365 Title : name
366 Usage : $obj->name()
367 Function: returns the descriptive name of the translation table
368 Example :
369 Returns : A string
370 Args : None
373 =cut
375 sub name{
376 my ($self) = @_;
378 my ($id) = $self->{'id'};
379 return $NAMES[$id];
382 =head2 tables
384 Title : tables
385 Usage : $obj->tables() or Bio::Tools::CodonTable->tables()
386 Function: returns a hash reference where each key is a valid codon
387 table id() number, and each value is the corresponding
388 codon table name() string
389 Example :
390 Returns : A hashref
391 Args : None
394 =cut
396 sub tables{
397 my %tables;
398 for my $id (0 .. $#NAMES) {
399 my $name = $NAMES[$id];
400 $tables{$id} = $name if $name;
402 return \%tables;
405 =head2 translate
407 Title : translate
408 Usage : $obj->translate('YTR')
409 Function: Returns a string of one letter amino acid codes from
410 nucleotide sequence input. The imput can be of any length.
412 Returns 'X' for unknown codons and codons that code for
413 more than one amino acid. Returns an empty string if input
414 is not three characters long. Exceptions for these are:
416 - IUPAC amino acid code B for Aspartic Acid and
417 Asparagine, is used.
418 - IUPAC amino acid code Z for Glutamic Acid, Glutamine is
419 used.
420 - if the codon is two nucleotides long and if by adding
421 an a third character 'N', it codes for a single amino
422 acid (with exceptions above), return that, otherwise
423 return empty string.
425 Returns empty string for other input strings that are not
426 three characters long.
428 Example :
429 Returns : a string of one letter ambiguous IUPAC amino acid codes
430 Args : ambiguous IUPAC nucleotide string
433 =cut
435 sub translate {
436 my ($self, $seq, $complete_codon) = @_;
437 $self->throw("Calling translate without a seq argument!") unless defined $seq;
438 return '' unless $seq;
440 my $id = $self->id;
441 my ($partial) = 0;
442 $partial = 2 if length($seq) % CODONSIZE == 2;
444 $seq = lc $seq;
445 $seq =~ tr/u/t/;
446 my $protein = "";
447 if ($seq =~ /[^actg]/ ) { #ambiguous chars
448 for (my $i = 0; $i < (length($seq) - (CODONSIZE-1)); $i+= CODONSIZE) {
449 my $triplet = substr($seq, $i, CODONSIZE);
450 if( $triplet eq $CODONGAP ) {
451 $protein .= $GAP;
452 } elsif (exists $CODONS->{$triplet}) {
453 $protein .= substr($TABLES[$id],
454 $CODONS->{$triplet},1);
455 } else {
456 $protein .= $self->_translate_ambiguous_codon($triplet);
459 } else { # simple, strict translation
460 for (my $i = 0; $i < (length($seq) - (CODONSIZE -1)); $i+=CODONSIZE) {
461 my $triplet = substr($seq, $i, CODONSIZE);
462 if( $triplet eq $CODONGAP ) {
463 $protein .= $GAP;
465 if (exists $CODONS->{$triplet}) {
466 $protein .= substr($TABLES[$id], $CODONS->{$triplet}, 1);
467 } else {
468 $protein .= 'X';
472 if ($partial == 2 && $complete_codon) { # 2 overhanging nucleotides
473 my $triplet = substr($seq, ($partial -4)). "n";
474 if( $triplet eq $CODONGAP ) {
475 $protein .= $GAP;
476 } elsif (exists $CODONS->{$triplet}) {
477 my $aa = substr($TABLES[$id], $CODONS->{$triplet},1);
478 $protein .= $aa;
479 } else {
480 $protein .= $self->_translate_ambiguous_codon($triplet, $partial);
483 return $protein;
486 sub _translate_ambiguous_codon {
487 my ($self, $triplet, $partial) = @_;
488 $partial ||= 0;
489 my $id = $self->id;
490 my $aa;
491 my @codons = $self->unambiguous_codons($triplet);
492 my %aas =();
493 foreach my $codon (@codons) {
494 $aas{substr($TABLES[$id],$CODONS->{$codon},1)} = 1;
496 my $count = scalar keys %aas;
497 if ( $count == 1 ) {
498 $aa = (keys %aas)[0];
500 elsif ( $count == 2 ) {
501 if ($aas{'D'} and $aas{'N'}) {
502 $aa = 'B';
504 elsif ($aas{'E'} and $aas{'Q'}) {
505 $aa = 'Z';
506 } else {
507 $partial ? ($aa = '') : ($aa = 'X');
509 } else {
510 $partial ? ($aa = '') : ($aa = 'X');
512 return $aa;
515 =head2 translate_strict
517 Title : translate_strict
518 Usage : $obj->translate_strict('ACT')
519 Function: returns one letter amino acid code for a codon input
521 Fast and simple translation. User is responsible to resolve
522 ambiguous nucleotide codes before calling this
523 method. Returns 'X' for unknown codons and an empty string
524 for input strings that are not three characters long.
526 It is not recommended to use this method in a production
527 environment. Use method translate, instead.
529 Example :
530 Returns : A string
531 Args : a codon = a three nucleotide character string
534 =cut
536 sub translate_strict{
537 my ($self, $value) = @_;
538 my $id = $self->{'id'};
540 $value = lc $value;
541 $value =~ tr/u/t/;
543 return '' unless length $value == 3;
545 return 'X' unless defined $CODONS->{$value};
547 return substr( $TABLES[$id], $CODONS->{$value}, 1 );
550 =head2 revtranslate
552 Title : revtranslate
553 Usage : $obj->revtranslate('G')
554 Function: returns codons for an amino acid
556 Returns an empty string for unknown amino acid
557 codes. Ambiguous IUPAC codes Asx,B, (Asp,D; Asn,N) and
558 Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three
559 letter amino acid codes are accepted. '*' and 'Ter' are
560 used for terminator.
562 By default, the output codons are shown in DNA. If the
563 output is needed in RNA (tr/t/u/), add a second argument
564 'RNA'.
566 Example : $obj->revtranslate('Gly', 'RNA')
567 Returns : An array of three lower case letter strings i.e. codons
568 Args : amino acid, 'RNA'
570 =cut
572 sub revtranslate {
573 my ($self, $value, $coding) = @_;
574 my @codons;
576 if (length($value) == 3 ) {
577 $value = lc $value;
578 $value = ucfirst $value;
579 $value = $THREELETTERSYMBOLS{$value};
581 if ( defined $value and $value =~ /$VALID_PROTEIN/
582 and length($value) == 1
584 my $id = $self->{'id'};
586 $value = uc $value;
587 my @aas = @{$IUPAC_AA{$value}};
588 foreach my $aa (@aas) {
589 #print $aa, " -2\n";
590 $aa = '\*' if $aa eq '*';
591 while ($TABLES[$id] =~ m/$aa/g) {
592 my $p = pos $TABLES[$id];
593 push (@codons, $TRCOL->{--$p});
598 if ($coding and uc ($coding) eq 'RNA') {
599 for my $i (0..$#codons) {
600 $codons[$i] =~ tr/t/u/;
604 return @codons;
607 =head2 reverse_translate_all
609 Title : reverse_translate_all
610 Usage : my $iup_str = $cttable->reverse_translate_all($seq_object)
611 my $iup_str = $cttable->reverse_translate_all($seq_object,
612 $cutable,
613 15);
614 Function: reverse translates a protein sequence into IUPAC nucleotide
615 sequence. An 'X' in the protein sequence is converted to 'NNN'
616 in the nucleotide sequence.
617 Returns : a string
618 Args : a Bio::PrimarySeqI compatible object (mandatory)
619 a Bio::CodonUsage::Table object and a threshold if only
620 codons with a relative frequency above the threshold are
621 to be considered.
622 =cut
624 sub reverse_translate_all {
625 my ($self, $obj, $cut, $threshold) = @_;
627 ## check args are OK
629 if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
630 $self->throw(" I need a Bio::PrimarySeqI object, not a [".
631 ref($obj) . "]");
633 if($obj->alphabet ne 'protein') {
634 $self->throw("Cannot reverse translate, need an amino acid sequence .".
635 "This sequence is of type [" . $obj->alphabet ."]");
637 my @data;
638 my @seq = split '', $obj->seq;
640 ## if we're not supplying a codon usage table...
641 if( !$cut && !$threshold) {
642 ## get lists of possible codons for each aa.
643 for my $aa (@seq) {
644 if ($aa =~ /x/i) {
645 push @data, (['NNN']);
646 }else {
647 my @cods = $self->revtranslate($aa);
648 push @data, \@cods;
651 }else{
652 #else we are supplying a codon usage table, we just want common codons
653 #check args first.
654 if(!$cut->isa('Bio::CodonUsage::Table')) {
655 $self->throw("I need a Bio::CodonUsage::Table object, not a [".
656 ref($cut). "].");
658 my $cod_ref = $cut->probable_codons($threshold);
659 for my $aa (@seq) {
660 if ($aa =~ /x/i) {
661 push @data, (['NNN']);
662 next;
664 push @data, $cod_ref->{$aa};
668 return $self->_make_iupac_string(\@data);
671 =head2 reverse_translate_best
673 Title : reverse_translate_best
674 Usage : my $str = $cttable->reverse_translate_best($seq_object,$cutable);
675 Function: Reverse translates a protein sequence into plain nucleotide
676 sequence (GATC), uses the most common codon for each amino acid
677 Returns : A string
678 Args : A Bio::PrimarySeqI compatible object and a Bio::CodonUsage::Table object
680 =cut
682 sub reverse_translate_best {
684 my ($self, $obj, $cut) = @_;
686 if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
687 $self->throw(" I need a Bio::PrimarySeqI object, not a [".
688 ref($obj) . "]");
690 if ($obj->alphabet ne 'protein') {
691 $self->throw("Cannot reverse translate, need an amino acid sequence .".
692 "This sequence is of type [" . $obj->alphabet ."]");
694 if ( !$cut | !$cut->isa('Bio::CodonUsage::Table')) {
695 $self->throw("I need a Bio::CodonUsage::Table object, not a [".
696 ref($cut). "].");
699 my $str = '';
700 my @seq = split '', $obj->seq;
702 my $cod_ref = $cut->most_common_codons();
704 for my $aa ( @seq ) {
705 if ($aa =~ /x/i) {
706 $str .= 'NNN';
707 next;
709 if ( defined $cod_ref->{$aa} ) {
710 $str .= $cod_ref->{$aa};
711 } else {
712 $self->throw("Input sequence contains invalid character: $aa");
715 return $str;
718 =head2 is_start_codon
720 Title : is_start_codon
721 Usage : $obj->is_start_codon('ATG')
722 Function: returns true (1) for all codons that can be used as a
723 translation start, false (0) for others.
724 Example : $myCodonTable->is_start_codon('ATG')
725 Returns : boolean
726 Args : codon
728 =cut
730 sub is_start_codon{
731 shift->_codon_is( shift, \@STARTS, 'M' );
734 =head2 is_ter_codon
736 Title : is_ter_codon
737 Usage : $obj->is_ter_codon('GAA')
738 Function: returns true (1) for all codons that can be used as a
739 translation tarminator, false (0) for others.
740 Example : $myCodonTable->is_ter_codon('ATG')
741 Returns : boolean
742 Args : codon
744 =cut
746 sub is_ter_codon{
747 my ($self, $value) = @_;
748 my $id = $self->{'id'};
750 # We need to ensure U is mapped to T (ie. UAG)
751 $value = uc $value;
752 $value =~ tr/U/T/;
754 if (length $value != 3 ) {
755 # Incomplete codons are not stop codons
756 return 0;
757 } else {
758 my $result = 0;
760 # For all the possible codons, if any are not a stop
761 # codon, fail immediately
762 for my $c ( $self->unambiguous_codons($value) ) {
763 my $m = substr( $TABLES[$id], $CODONS->{$c}, 1 );
764 if($m eq $TERMINATOR) {
765 $result = 1;
766 } else {
767 return 0;
770 return $result;
774 # desc: compares the passed value with a single entry in the given
775 # codon table
776 # args: a value (typically a three-char string like 'atg'),
777 # a reference to the appropriate set of codon tables,
778 # a single-character value to check for at the position in the
779 # given codon table
780 # ret: boolean, true if the given codon table contains the $key at the
781 # position corresponding to $value
782 sub _codon_is {
783 my ($self, $value, $table, $key ) = @_;
785 return 0 unless length $value == 3;
787 $value = lc $value;
788 $value =~ tr/u/t/;
790 my $id = $self->{'id'};
791 for my $c ( $self->unambiguous_codons($value) ) {
792 my $m = substr( $table->[$id], $CODONS->{$c}, 1 );
793 if ($m eq $key) { return 1; }
795 return 0;
798 =head2 is_unknown_codon
800 Title : is_unknown_codon
801 Usage : $obj->is_unknown_codon('GAJ')
802 Function: returns false (0) for all codons that are valid,
803 true (1) for others.
804 Example : $myCodonTable->is_unknown_codon('NTG')
805 Returns : boolean
806 Args : codon
809 =cut
811 sub is_unknown_codon{
812 my ($self, $value) = @_;
813 $value = lc $value;
814 $value =~ tr/u/t/;
815 return 1 unless $self->unambiguous_codons($value);
816 return 0;
819 =head2 unambiguous_codons
821 Title : unambiguous_codons
822 Usage : @codons = $self->unambiguous_codons('ACN')
823 Returns : array of strings (one-letter unambiguous amino acid codes)
824 Args : a codon = a three IUPAC nucleotide character string
826 =cut
828 sub unambiguous_codons{
829 my ($self,$value) = @_;
830 my @nts = map { $IUPAC_DNA{uc $_} } split(//, $value);
832 my @codons;
833 for my $i ( @{$nts[0]} ) {
834 for my $j ( @{$nts[1]} ) {
835 for my $k ( @{$nts[2]} ) {
836 push @codons, lc "$i$j$k";
838 return @codons;
841 =head2 _unambiquous_codons
843 deprecated, now an alias for unambiguous_codons
845 =cut
847 sub _unambiquous_codons {
848 unambiguous_codons( undef, @_ );
851 =head2 add_table
853 Title : add_table
854 Usage : $newid = $ct->add_table($name, $table, $starts)
855 Function: Add a custom Codon Table into the object.
856 Know what you are doing, only the length of
857 the argument strings is checked!
858 Returns : the id of the new codon table
859 Args : name, a string, optional (can be empty)
860 table, a string of 64 characters
861 startcodons, a string of 64 characters, defaults to standard
863 =cut
865 sub add_table {
866 my ($self, $name, $table, $starts) = @_;
868 $name ||= 'Custom' . $#NAMES + 1;
869 $starts ||= $STARTS[1];
870 $self->throw('Suspect input!')
871 unless length($table) == 64 and length($starts) == 64;
873 push @NAMES, $name;
874 push @TABLES, $table;
875 push @STARTS, $starts;
877 return $#NAMES;
880 sub _make_iupac_string {
881 my ($self, $cod_ref) = @_;
882 if(ref($cod_ref) ne 'ARRAY') {
883 $self->throw(" I need a reference to a list of references to codons, ".
884 " not a [". ref($cod_ref) . "].");
886 my %iupac_hash = Bio::Tools::IUPAC->iupac_rev_iub();
887 my $iupac_string = ''; ## the string to be returned
888 for my $aa (@$cod_ref) {
890 ## scan through codon positions, record the differing values,
891 # then look up in the iub hash
892 for my $index(0..2) {
893 my %h;
894 map { my $k = substr($_,$index,1);
895 $h{$k} = undef;} @$aa;
896 my $lookup_key = join '', sort{$a cmp $b}keys %h;
898 ## extend string
899 $iupac_string .= $iupac_hash{uc$lookup_key};
902 return $iupac_string;