Fix Codon Table 0 (strict) that was not selectable
[bioperl-live.git] / lib / Bio / Tools / CodonTable.pm
blob315c541d48446f5a74a332f8687f9612d8227c56
2 # bioperl module for Bio::Tools::CodonTable
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
8 # Copyright Heikki Lehvaslaiho
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::CodonTable - Codon table object
18 =head1 SYNOPSIS
20 # This is a read-only class for all known codon tables. The IDs are
21 # the ones used by nucleotide sequence databases. All common IUPAC
22 # ambiguity codes for DNA, RNA and amino acids are recognized.
24 use Bio::Tools::CodonTable;
26 # defaults to ID 1 "Standard"
27 $myCodonTable = Bio::Tools::CodonTable->new();
28 $myCodonTable2 = Bio::Tools::CodonTable->new( -id => 3 );
30 # change codon table
31 $myCodonTable->id(5);
33 # examine codon table
34 print join (' ', "The name of the codon table no.", $myCodonTable->id(4),
35 "is:", $myCodonTable->name(), "\n");
37 # print possible codon tables
38 $tables = Bio::Tools::CodonTable->tables;
39 while ( ($id,$name) = each %{$tables} ) {
40 print "$id = $name\n";
43 # translate a codon
44 $aa = $myCodonTable->translate('ACU');
45 $aa = $myCodonTable->translate('act');
46 $aa = $myCodonTable->translate('ytr');
48 # reverse translate an amino acid
49 @codons = $myCodonTable->revtranslate('A');
50 @codons = $myCodonTable->revtranslate('Ser');
51 @codons = $myCodonTable->revtranslate('Glx');
52 @codons = $myCodonTable->revtranslate('cYS', 'rna');
54 # reverse translate an entire amino acid sequence into a IUPAC
55 # nucleotide string
57 my $seqobj = Bio::PrimarySeq->new(-seq => 'FHGERHEL');
58 my $iupac_str = $myCodonTable->reverse_translate_all($seqobj);
60 # boolean tests
61 print "Is a start\n" if $myCodonTable->is_start_codon('ATG');
62 print "Is a terminator\n" if $myCodonTable->is_ter_codon('tar');
63 print "Is a unknown\n" if $myCodonTable->is_unknown_codon('JTG');
65 =head1 DESCRIPTION
67 Codon tables are also called translation tables or genetic codes
68 since that is what they represent. A bit more complete picture
69 of the full complexity of codon usage in various taxonomic groups
70 is presented at the NCBI Genetic Codes Home page.
72 CodonTable is a BioPerl class that knows all current translation
73 tables that are used by primary nucleotide sequence databases
74 (GenBank, EMBL and DDBJ). It provides methods to output information
75 about tables and relationships between codons and amino acids.
77 This class and its methods recognized all common IUPAC ambiguity codes
78 for DNA, RNA and animo acids. The translation method follows the
79 conventions in EMBL and TREMBL databases.
81 It is a nuisance to separate RNA and cDNA representations of nucleic
82 acid transcripts. The CodonTable object accepts codons of both type as
83 input and allows the user to set the mode for output when reverse
84 translating. Its default for output is DNA.
86 Note:
88 This class deals primarily with individual codons and amino
89 acids. However in the interest of speed you can L<translate>
90 longer sequence, too. The full complexity of protein translation
91 is tackled by L<Bio::PrimarySeqI::translate>.
94 The amino acid codes are IUPAC recommendations for common amino acids:
96 A Ala Alanine
97 R Arg Arginine
98 N Asn Asparagine
99 D Asp Aspartic acid
100 C Cys Cysteine
101 Q Gln Glutamine
102 E Glu Glutamic acid
103 G Gly Glycine
104 H His Histidine
105 I Ile Isoleucine
106 L Leu Leucine
107 K Lys Lysine
108 M Met Methionine
109 F Phe Phenylalanine
110 P Pro Proline
111 O Pyl Pyrrolysine (22nd amino acid)
112 U Sec Selenocysteine (21st amino acid)
113 S Ser Serine
114 T Thr Threonine
115 W Trp Tryptophan
116 Y Tyr Tyrosine
117 V Val Valine
118 B Asx Aspartic acid or Asparagine
119 Z Glx Glutamine or Glutamic acid
120 J Xle Isoleucine or Valine (mass spec ambiguity)
121 X Xaa Any or unknown amino acid
124 It is worth noting that, "Bacterial" codon table no. 11 produces an
125 polypeptide that is, confusingly, identical to the standard one. The
126 only differences are in available initiator codons.
129 NCBI Genetic Codes home page:
130 (Last update of the Genetic Codes: Nov. 18, 2016)
131 https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
133 ASN.1 version with ids 1 to 25 is at:
134 ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
136 Thanks to Matteo diTomasso for the original Perl implementation
137 of these tables.
139 =head1 FEEDBACK
141 =head2 Mailing Lists
143 User feedback is an integral part of the evolution of this and other
144 Bioperl modules. Send your comments and suggestions preferably to the
145 Bioperl mailing lists Your participation is much appreciated.
147 bioperl-l@bioperl.org - General discussion
148 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
150 =head2 Support
152 Please direct usage questions or support issues to the mailing list:
154 I<bioperl-l@bioperl.org>
156 rather than to the module maintainer directly. Many experienced and
157 reponsive experts will be able look at the problem and quickly
158 address it. Please include a thorough description of the problem
159 with code and data examples if at all possible.
161 =head2 Reporting Bugs
163 Report bugs to the Bioperl bug tracking system to help us keep track
164 the bugs and their resolution. Bug reports can be submitted via the
165 web:
167 https://github.com/bioperl/bioperl-live/issues
169 =head1 AUTHOR - Heikki Lehvaslaiho
171 Email: heikki-at-bioperl-dot-org
173 =head1 APPENDIX
175 The rest of the documentation details each of the object
176 methods. Internal methods are usually preceded with a _
178 =cut
180 # Let the code begin...
182 package Bio::Tools::CodonTable;
184 use vars qw(@NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA $CODONGAP $GAP
185 %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
186 use strict;
188 # Object preamble - inherits from Bio::Root::Root
189 use Bio::Tools::IUPAC;
190 use Bio::SeqUtils;
192 use base qw(Bio::Root::Root);
195 # first set internal values for all translation tables
197 BEGIN {
198 use constant CODONSIZE => 3;
199 $GAP = '-';
200 $CODONGAP = $GAP x CODONSIZE;
202 @NAMES = #id
204 'Strict', #0, special option for ATG-only start
205 'Standard', #1
206 'Vertebrate Mitochondrial',#2
207 'Yeast Mitochondrial',# 3
208 'Mold, Protozoan, and Coelenterate Mitochondrial and Mycoplasma/Spiroplasma',#4
209 'Invertebrate Mitochondrial',#5
210 'Ciliate, Dasycladacean and Hexamita Nuclear',# 6
211 '', '',
212 'Echinoderm and Flatworm Mitochondrial',#9
213 'Euplotid Nuclear',#10
214 'Bacterial, Archaeal and Plant Plastid',# 11
215 'Alternative Yeast Nuclear',# 12
216 'Ascidian Mitochondrial',# 13
217 'Alternative Flatworm Mitochondrial',# 14
219 'Chlorophycean Mitochondrial',# 16
220 '', '', '', '',
221 'Trematode Mitochondrial',# 21
222 'Scenedesmus obliquus Mitochondrial', #22
223 'Thraustochytrium Mitochondrial', #23
224 'Pterobranchia Mitochondrial', #24
225 'Candidate Division SR1 and Gracilibacteria', #25
226 'Pachysolen tannophilus Nuclear Code', #26
227 'Karyorelict Nuclear', #27
228 'Condylostoma Nuclear', #28
229 'Mesodinium Nuclear', #29
230 'Peritrich Nuclear', #30
231 'Blastocrithidia Nuclear' #31
234 @TABLES =
236 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
237 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
238 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG
239 FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
240 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
241 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
242 FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
243 '' ''
244 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
245 FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
246 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
247 FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
248 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
249 FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
251 FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
252 '' '' '' ''
253 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
254 FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
255 FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
256 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG
257 FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
258 FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
259 FFLLSSSSYYQQCCWWLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
260 FFLLSSSSYYQQCCWWLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
261 FFLLSSSSYYYYCC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
262 FFLLSSSSYYEECC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
263 FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
266 # (bases used for these tables, for reference)
267 # 1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
268 # 2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
269 # 3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
271 @STARTS =
273 ----------**--*--------------------M----------------------------
274 ---M------**--*----M---------------M----------------------------
275 ----------**--------------------MMMM----------**---M------------
276 ----------**----------------------MM----------------------------
277 --MM------**-------M------------MMMM---------------M------------
278 ---M------**--------------------MMMM---------------M------------
279 --------------*--------------------M----------------------------
280 '' ''
281 ----------**-----------------------M---------------M------------
282 ----------**-----------------------M----------------------------
283 ---M------**--*----M------------MMMM---------------M------------
284 ----------**--*----M---------------M----------------------------
285 ---M------**----------------------MM---------------M------------
286 -----------*-----------------------M----------------------------
288 ----------*---*--------------------M----------------------------
289 '' '' '' ''
290 ----------**-----------------------M---------------M------------
291 ------*---*---*--------------------M----------------------------
292 --*-------**--*-----------------M--M---------------M------------
293 ---M------**-------M---------------M---------------M------------
294 ---M------**-----------------------M---------------M------------
295 ----------**--*----M---------------M----------------------------
296 --------------*--------------------M----------------------------
297 ----------**--*--------------------M----------------------------
298 --------------*--------------------M----------------------------
299 --------------*--------------------M----------------------------
300 ----------**-----------------------M----------------------------
303 my @nucs = qw(t c a g);
304 my $x = 0;
305 ($CODONS, $TRCOL) = ({}, {});
306 for my $i (@nucs) {
307 for my $j (@nucs) {
308 for my $k (@nucs) {
309 my $codon = "$i$j$k";
310 $CODONS->{$codon} = $x;
311 $TRCOL->{$x} = $codon;
312 $x++;
316 %IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub();
317 %IUPAC_AA = Bio::Tools::IUPAC->iupac_iup();
318 %THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2);
319 $VALID_PROTEIN = '['.join('',Bio::SeqUtils->valid_aa(0)).']';
320 $TERMINATOR = '*';
323 sub new {
324 my($class,@args) = @_;
325 my $self = $class->SUPER::new(@args);
327 my($id) =
328 $self->_rearrange([qw(ID
330 @args);
332 $id = 1 if ( ! defined ( $id ) );
333 $id && $self->id($id);
334 return $self; # success - we hope!
337 =head2 id
339 Title : id
340 Usage : $obj->id(3); $id_integer = $obj->id();
341 Function: Sets or returns the id of the translation table. IDs are
342 integers from 0 (special ATG-only start) to 25, excluding
343 7-8 and 17-20 which have been removed. If an invalid ID is
344 given the method returns 1, the standard table.
345 Example :
346 Returns : value of id, a scalar, warn and fall back to 1 (standard table)
347 if specified id is not valid
348 Args : newvalue (optional)
350 =cut
352 sub id{
353 my ($self,$value) = @_;
354 if( defined $value) {
355 if ( not defined $TABLES[$value] or $TABLES[$value] eq '') {
356 $self->warn("Not a valid codon table ID [$value], using [1] instead ");
357 $value = 1;
359 $self->{'id'} = $value;
361 return $self->{'id'};
364 =head2 name
366 Title : name
367 Usage : $obj->name()
368 Function: returns the descriptive name of the translation table
369 Example :
370 Returns : A string
371 Args : None
374 =cut
376 sub name{
377 my ($self) = @_;
379 my ($id) = $self->{'id'};
380 return $NAMES[$id];
383 =head2 tables
385 Title : tables
386 Usage : $obj->tables() or Bio::Tools::CodonTable->tables()
387 Function: returns a hash reference where each key is a valid codon
388 table id() number, and each value is the corresponding
389 codon table name() string
390 Example :
391 Returns : A hashref
392 Args : None
395 =cut
397 sub tables{
398 my %tables;
399 for my $id (0 .. $#NAMES) {
400 my $name = $NAMES[$id];
401 $tables{$id} = $name if $name;
403 return \%tables;
406 =head2 translate
408 Title : translate
409 Usage : $obj->translate('YTR')
410 Function: Returns a string of one letter amino acid codes from
411 nucleotide sequence input. The imput can be of any length.
413 Returns 'X' for unknown codons and codons that code for
414 more than one amino acid. Returns an empty string if input
415 is not three characters long. Exceptions for these are:
417 - IUPAC amino acid code B for Aspartic Acid and
418 Asparagine, is used.
419 - IUPAC amino acid code Z for Glutamic Acid, Glutamine is
420 used.
421 - if the codon is two nucleotides long and if by adding
422 an a third character 'N', it codes for a single amino
423 acid (with exceptions above), return that, otherwise
424 return empty string.
426 Returns empty string for other input strings that are not
427 three characters long.
429 Example :
430 Returns : a string of one letter ambiguous IUPAC amino acid codes
431 Args : ambiguous IUPAC nucleotide string
434 =cut
436 sub translate {
437 my ($self, $seq, $complete_codon) = @_;
438 $self->throw("Calling translate without a seq argument!") unless defined $seq;
439 return '' unless $seq;
441 my $id = $self->id;
442 my ($partial) = 0;
443 $partial = 2 if length($seq) % CODONSIZE == 2;
445 $seq = lc $seq;
446 $seq =~ tr/u/t/;
447 my $protein = "";
448 if ($seq =~ /[^actg]/ ) { #ambiguous chars
449 for (my $i = 0; $i < (length($seq) - (CODONSIZE-1)); $i+= CODONSIZE) {
450 my $triplet = substr($seq, $i, CODONSIZE);
451 if( $triplet eq $CODONGAP ) {
452 $protein .= $GAP;
453 } elsif (exists $CODONS->{$triplet}) {
454 $protein .= substr($TABLES[$id],
455 $CODONS->{$triplet},1);
456 } else {
457 $protein .= $self->_translate_ambiguous_codon($triplet);
460 } else { # simple, strict translation
461 for (my $i = 0; $i < (length($seq) - (CODONSIZE -1)); $i+=CODONSIZE) {
462 my $triplet = substr($seq, $i, CODONSIZE);
463 if( $triplet eq $CODONGAP ) {
464 $protein .= $GAP;
466 if (exists $CODONS->{$triplet}) {
467 $protein .= substr($TABLES[$id], $CODONS->{$triplet}, 1);
468 } else {
469 $protein .= 'X';
473 if ($partial == 2 && $complete_codon) { # 2 overhanging nucleotides
474 my $triplet = substr($seq, ($partial -4)). "n";
475 if( $triplet eq $CODONGAP ) {
476 $protein .= $GAP;
477 } elsif (exists $CODONS->{$triplet}) {
478 my $aa = substr($TABLES[$id], $CODONS->{$triplet},1);
479 $protein .= $aa;
480 } else {
481 $protein .= $self->_translate_ambiguous_codon($triplet, $partial);
484 return $protein;
487 sub _translate_ambiguous_codon {
488 my ($self, $triplet, $partial) = @_;
489 $partial ||= 0;
490 my $id = $self->id;
491 my $aa;
492 my @codons = $self->unambiguous_codons($triplet);
493 my %aas =();
494 foreach my $codon (@codons) {
495 $aas{substr($TABLES[$id],$CODONS->{$codon},1)} = 1;
497 my $count = scalar keys %aas;
498 if ( $count == 1 ) {
499 $aa = (keys %aas)[0];
501 elsif ( $count == 2 ) {
502 if ($aas{'D'} and $aas{'N'}) {
503 $aa = 'B';
505 elsif ($aas{'E'} and $aas{'Q'}) {
506 $aa = 'Z';
507 } else {
508 $partial ? ($aa = '') : ($aa = 'X');
510 } else {
511 $partial ? ($aa = '') : ($aa = 'X');
513 return $aa;
516 =head2 translate_strict
518 Title : translate_strict
519 Usage : $obj->translate_strict('ACT')
520 Function: returns one letter amino acid code for a codon input
522 Fast and simple translation. User is responsible to resolve
523 ambiguous nucleotide codes before calling this
524 method. Returns 'X' for unknown codons and an empty string
525 for input strings that are not three characters long.
527 It is not recommended to use this method in a production
528 environment. Use method translate, instead.
530 Example :
531 Returns : A string
532 Args : a codon = a three nucleotide character string
535 =cut
537 sub translate_strict{
538 my ($self, $value) = @_;
539 my $id = $self->{'id'};
541 $value = lc $value;
542 $value =~ tr/u/t/;
544 return '' unless length $value == 3;
546 return 'X' unless defined $CODONS->{$value};
548 return substr( $TABLES[$id], $CODONS->{$value}, 1 );
551 =head2 revtranslate
553 Title : revtranslate
554 Usage : $obj->revtranslate('G')
555 Function: returns codons for an amino acid
557 Returns an empty string for unknown amino acid
558 codes. Ambiguous IUPAC codes Asx,B, (Asp,D; Asn,N) and
559 Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three
560 letter amino acid codes are accepted. '*' and 'Ter' are
561 used for terminator.
563 By default, the output codons are shown in DNA. If the
564 output is needed in RNA (tr/t/u/), add a second argument
565 'RNA'.
567 Example : $obj->revtranslate('Gly', 'RNA')
568 Returns : An array of three lower case letter strings i.e. codons
569 Args : amino acid, 'RNA'
571 =cut
573 sub revtranslate {
574 my ($self, $value, $coding) = @_;
575 my @codons;
577 if (length($value) == 3 ) {
578 $value = lc $value;
579 $value = ucfirst $value;
580 $value = $THREELETTERSYMBOLS{$value};
582 if ( defined $value and $value =~ /$VALID_PROTEIN/
583 and length($value) == 1
585 my $id = $self->{'id'};
587 $value = uc $value;
588 my @aas = @{$IUPAC_AA{$value}};
589 foreach my $aa (@aas) {
590 #print $aa, " -2\n";
591 $aa = '\*' if $aa eq '*';
592 while ($TABLES[$id] =~ m/$aa/g) {
593 my $p = pos $TABLES[$id];
594 push (@codons, $TRCOL->{--$p});
599 if ($coding and uc ($coding) eq 'RNA') {
600 for my $i (0..$#codons) {
601 $codons[$i] =~ tr/t/u/;
605 return @codons;
608 =head2 reverse_translate_all
610 Title : reverse_translate_all
611 Usage : my $iup_str = $cttable->reverse_translate_all($seq_object)
612 my $iup_str = $cttable->reverse_translate_all($seq_object,
613 $cutable,
614 15);
615 Function: reverse translates a protein sequence into IUPAC nucleotide
616 sequence. An 'X' in the protein sequence is converted to 'NNN'
617 in the nucleotide sequence.
618 Returns : a string
619 Args : a Bio::PrimarySeqI compatible object (mandatory)
620 a Bio::CodonUsage::Table object and a threshold if only
621 codons with a relative frequency above the threshold are
622 to be considered.
623 =cut
625 sub reverse_translate_all {
626 my ($self, $obj, $cut, $threshold) = @_;
628 ## check args are OK
630 if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
631 $self->throw(" I need a Bio::PrimarySeqI object, not a [".
632 ref($obj) . "]");
634 if($obj->alphabet ne 'protein') {
635 $self->throw("Cannot reverse translate, need an amino acid sequence .".
636 "This sequence is of type [" . $obj->alphabet ."]");
638 my @data;
639 my @seq = split '', $obj->seq;
641 ## if we're not supplying a codon usage table...
642 if( !$cut && !$threshold) {
643 ## get lists of possible codons for each aa.
644 for my $aa (@seq) {
645 if ($aa =~ /x/i) {
646 push @data, (['NNN']);
647 }else {
648 my @cods = $self->revtranslate($aa);
649 push @data, \@cods;
652 }else{
653 #else we are supplying a codon usage table, we just want common codons
654 #check args first.
655 if(!$cut->isa('Bio::CodonUsage::Table')) {
656 $self->throw("I need a Bio::CodonUsage::Table object, not a [".
657 ref($cut). "].");
659 my $cod_ref = $cut->probable_codons($threshold);
660 for my $aa (@seq) {
661 if ($aa =~ /x/i) {
662 push @data, (['NNN']);
663 next;
665 push @data, $cod_ref->{$aa};
669 return $self->_make_iupac_string(\@data);
672 =head2 reverse_translate_best
674 Title : reverse_translate_best
675 Usage : my $str = $cttable->reverse_translate_best($seq_object,$cutable);
676 Function: Reverse translates a protein sequence into plain nucleotide
677 sequence (GATC), uses the most common codon for each amino acid
678 Returns : A string
679 Args : A Bio::PrimarySeqI compatible object and a Bio::CodonUsage::Table object
681 =cut
683 sub reverse_translate_best {
685 my ($self, $obj, $cut) = @_;
687 if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
688 $self->throw(" I need a Bio::PrimarySeqI object, not a [".
689 ref($obj) . "]");
691 if ($obj->alphabet ne 'protein') {
692 $self->throw("Cannot reverse translate, need an amino acid sequence .".
693 "This sequence is of type [" . $obj->alphabet ."]");
695 if ( !$cut | !$cut->isa('Bio::CodonUsage::Table')) {
696 $self->throw("I need a Bio::CodonUsage::Table object, not a [".
697 ref($cut). "].");
700 my $str = '';
701 my @seq = split '', $obj->seq;
703 my $cod_ref = $cut->most_common_codons();
705 for my $aa ( @seq ) {
706 if ($aa =~ /x/i) {
707 $str .= 'NNN';
708 next;
710 if ( defined $cod_ref->{$aa} ) {
711 $str .= $cod_ref->{$aa};
712 } else {
713 $self->throw("Input sequence contains invalid character: $aa");
716 return $str;
719 =head2 is_start_codon
721 Title : is_start_codon
722 Usage : $obj->is_start_codon('ATG')
723 Function: returns true (1) for all codons that can be used as a
724 translation start, false (0) for others.
725 Example : $myCodonTable->is_start_codon('ATG')
726 Returns : boolean
727 Args : codon
729 =cut
731 sub is_start_codon{
732 shift->_codon_is( shift, \@STARTS, 'M' );
735 =head2 is_ter_codon
737 Title : is_ter_codon
738 Usage : $obj->is_ter_codon('GAA')
739 Function: returns true (1) for all codons that can be used as a
740 translation tarminator, false (0) for others.
741 Example : $myCodonTable->is_ter_codon('ATG')
742 Returns : boolean
743 Args : codon
745 =cut
747 sub is_ter_codon{
748 my ($self, $value) = @_;
749 my $id = $self->{'id'};
751 # We need to ensure U is mapped to T (ie. UAG)
752 $value = uc $value;
753 $value =~ tr/U/T/;
755 if (length $value != 3 ) {
756 # Incomplete codons are not stop codons
757 return 0;
758 } else {
759 my $result = 0;
761 # For all the possible codons, if any are not a stop
762 # codon, fail immediately
763 for my $c ( $self->unambiguous_codons($value) ) {
764 my $m = substr( $TABLES[$id], $CODONS->{$c}, 1 );
765 if($m eq $TERMINATOR) {
766 $result = 1;
767 } else {
768 return 0;
771 return $result;
775 # desc: compares the passed value with a single entry in the given
776 # codon table
777 # args: a value (typically a three-char string like 'atg'),
778 # a reference to the appropriate set of codon tables,
779 # a single-character value to check for at the position in the
780 # given codon table
781 # ret: boolean, true if the given codon table contains the $key at the
782 # position corresponding to $value
783 sub _codon_is {
784 my ($self, $value, $table, $key ) = @_;
786 return 0 unless length $value == 3;
788 $value = lc $value;
789 $value =~ tr/u/t/;
791 my $id = $self->{'id'};
792 for my $c ( $self->unambiguous_codons($value) ) {
793 my $m = substr( $table->[$id], $CODONS->{$c}, 1 );
794 if ($m eq $key) { return 1; }
796 return 0;
799 =head2 is_unknown_codon
801 Title : is_unknown_codon
802 Usage : $obj->is_unknown_codon('GAJ')
803 Function: returns false (0) for all codons that are valid,
804 true (1) for others.
805 Example : $myCodonTable->is_unknown_codon('NTG')
806 Returns : boolean
807 Args : codon
810 =cut
812 sub is_unknown_codon{
813 my ($self, $value) = @_;
814 $value = lc $value;
815 $value =~ tr/u/t/;
816 return 1 unless $self->unambiguous_codons($value);
817 return 0;
820 =head2 unambiguous_codons
822 Title : unambiguous_codons
823 Usage : @codons = $self->unambiguous_codons('ACN')
824 Returns : array of strings (one-letter unambiguous amino acid codes)
825 Args : a codon = a three IUPAC nucleotide character string
827 =cut
829 sub unambiguous_codons{
830 my ($self,$value) = @_;
831 my @nts = map { $IUPAC_DNA{uc $_} } split(//, $value);
833 my @codons;
834 for my $i ( @{$nts[0]} ) {
835 for my $j ( @{$nts[1]} ) {
836 for my $k ( @{$nts[2]} ) {
837 push @codons, lc "$i$j$k";
839 return @codons;
842 =head2 _unambiquous_codons
844 deprecated, now an alias for unambiguous_codons
846 =cut
848 sub _unambiquous_codons {
849 unambiguous_codons( undef, @_ );
852 =head2 add_table
854 Title : add_table
855 Usage : $newid = $ct->add_table($name, $table, $starts)
856 Function: Add a custom Codon Table into the object.
857 Know what you are doing, only the length of
858 the argument strings is checked!
859 Returns : the id of the new codon table
860 Args : name, a string, optional (can be empty)
861 table, a string of 64 characters
862 startcodons, a string of 64 characters, defaults to standard
864 =cut
866 sub add_table {
867 my ($self, $name, $table, $starts) = @_;
869 $name ||= 'Custom' . $#NAMES + 1;
870 $starts ||= $STARTS[1];
871 $self->throw('Suspect input!')
872 unless length($table) == 64 and length($starts) == 64;
874 push @NAMES, $name;
875 push @TABLES, $table;
876 push @STARTS, $starts;
878 return $#NAMES;
881 sub _make_iupac_string {
882 my ($self, $cod_ref) = @_;
883 if(ref($cod_ref) ne 'ARRAY') {
884 $self->throw(" I need a reference to a list of references to codons, ".
885 " not a [". ref($cod_ref) . "].");
887 my %iupac_hash = Bio::Tools::IUPAC->iupac_rev_iub();
888 my $iupac_string = ''; ## the string to be returned
889 for my $aa (@$cod_ref) {
891 ## scan through codon positions, record the differing values,
892 # then look up in the iub hash
893 for my $index(0..2) {
894 my %h;
895 map { my $k = substr($_,$index,1);
896 $h{$k} = undef;} @$aa;
897 my $lookup_key = join '', sort{$a cmp $b}keys %h;
899 ## extend string
900 $iupac_string .= $iupac_hash{uc$lookup_key};
903 return $iupac_string;