lib/Bio/Tools/CodonTable.pm

   1 #
   2 # bioperl module for Bio::Tools::CodonTable
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   7 #
   8 # Copyright Heikki Lehvaslaiho
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Tools::CodonTable - Codon table object
  17
  18 =head1 SYNOPSIS
  19
  20   # This is a read-only class for all known codon tables.  The IDs are
  21   # the ones used by nucleotide sequence databases.  All common IUPAC
  22   # ambiguity codes for DNA, RNA and amino acids are recognized.
  23
  24   use Bio::Tools::CodonTable;
  25
  26   # defaults to ID 1 "Standard"
  27   $myCodonTable   = Bio::Tools::CodonTable->new();
  28   $myCodonTable2  = Bio::Tools::CodonTable->new( -id => 3 );
  29
  30   # change codon table
  31   $myCodonTable->id(5);
  32
  33   # examine codon table
  34   print  join (' ', "The name of the codon table no.", $myCodonTable->id(4),
  35            "is:", $myCodonTable->name(), "\n");
  36
  37   # print possible codon tables
  38   $tables = Bio::Tools::CodonTable->tables;
  39   while ( ($id,$name) = each %{$tables} ) {
  40     print "$id = $name\n";
  41   }
  42
  43   # translate a codon
  44   $aa = $myCodonTable->translate('ACU');
  45   $aa = $myCodonTable->translate('act');
  46   $aa = $myCodonTable->translate('ytr');
  47
  48   # reverse translate an amino acid
  49   @codons = $myCodonTable->revtranslate('A');
  50   @codons = $myCodonTable->revtranslate('Ser');
  51   @codons = $myCodonTable->revtranslate('Glx');
  52   @codons = $myCodonTable->revtranslate('cYS', 'rna');
  53
  54   # reverse translate an entire amino acid sequence into a IUPAC
  55   # nucleotide string
  56
  57   my $seqobj    = Bio::PrimarySeq->new(-seq => 'FHGERHEL');
  58   my $iupac_str = $myCodonTable->reverse_translate_all($seqobj);
  59
  60   # boolean tests
  61   print "Is a start\n"       if $myCodonTable->is_start_codon('ATG');
  62   print "Is a terminator\n"  if $myCodonTable->is_ter_codon('tar');
  63   print "Is a unknown\n"     if $myCodonTable->is_unknown_codon('JTG');
  64
  65 =head1 DESCRIPTION
  66
  67 Codon tables are also called translation tables or genetic codes
  68 since that is what they represent. A bit more complete picture
  69 of the full complexity of codon usage in various taxonomic groups
  70 is presented at the NCBI Genetic Codes Home page.
  71
  72 CodonTable is a BioPerl class that knows all current translation
  73 tables that are used by primary nucleotide sequence databases
  74 (GenBank, EMBL and DDBJ). It provides methods to output information
  75 about tables and relationships between codons and amino acids.
  76
  77 This class and its methods recognized all common IUPAC ambiguity codes
  78 for DNA, RNA and animo acids. The translation method follows the
  79 conventions in EMBL and TREMBL databases.
  80
  81 It is a nuisance to separate RNA and cDNA representations of nucleic
  82 acid transcripts. The CodonTable object accepts codons of both type as
  83 input and allows the user to set the mode for output when reverse
  84 translating. Its default for output is DNA.
  85
  86 Note:
  87
  88 This class deals primarily with individual codons and amino
  89 acids. However in the interest of speed you can L<translate>
  90 longer sequence, too. The full complexity of protein translation
  91 is tackled by L<Bio::PrimarySeqI::translate>.
  92
  93
  94 The amino acid codes are IUPAC recommendations for common amino acids:
  95
  96           A           Ala            Alanine
  97           R           Arg            Arginine
  98           N           Asn            Asparagine
  99           D           Asp            Aspartic acid
 100           C           Cys            Cysteine
 101           Q           Gln            Glutamine
 102           E           Glu            Glutamic acid
 103           G           Gly            Glycine
 104           H           His            Histidine
 105           I           Ile            Isoleucine
 106           L           Leu            Leucine
 107           K           Lys            Lysine
 108           M           Met            Methionine
 109           F           Phe            Phenylalanine
 110           P           Pro            Proline
 111           O           Pyl            Pyrrolysine (22nd amino acid)
 112           U           Sec            Selenocysteine (21st amino acid)
 113           S           Ser            Serine
 114           T           Thr            Threonine
 115           W           Trp            Tryptophan
 116           Y           Tyr            Tyrosine
 117           V           Val            Valine
 118           B           Asx            Aspartic acid or Asparagine
 119           Z           Glx            Glutamine or Glutamic acid
 120           J           Xle            Isoleucine or Valine (mass spec ambiguity)
 121           X           Xaa            Any or unknown amino acid
 122
 123
 124 It is worth noting that, "Bacterial" codon table no. 11 produces an
 125 polypeptide that is, confusingly, identical to the standard one. The
 126 only differences are in available initiator codons.
 127
 128
 129 NCBI Genetic Codes home page:
 130      (Last update of the Genetic Codes: Nov. 18, 2016)
 131      https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
 132
 133 ASN.1 version with ids 1 to 25 is at:
 134      ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
 135
 136 Thanks to Matteo diTomasso for the original Perl implementation
 137 of these tables.
 138
 139 =head1 FEEDBACK
 140
 141 =head2 Mailing Lists
 142
 143 User feedback is an integral part of the evolution of this and other
 144 Bioperl modules. Send your comments and suggestions preferably to the
 145 Bioperl mailing lists  Your participation is much appreciated.
 146
 147   bioperl-l@bioperl.org                  - General discussion
 148   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 149
 150 =head2 Support
 151
 152 Please direct usage questions or support issues to the mailing list:
 153
 154 I<bioperl-l@bioperl.org>
 155
 156 rather than to the module maintainer directly. Many experienced and
 157 reponsive experts will be able look at the problem and quickly
 158 address it. Please include a thorough description of the problem
 159 with code and data examples if at all possible.
 160
 161 =head2 Reporting Bugs
 162
 163 Report bugs to the Bioperl bug tracking system to help us keep track
 164 the bugs and their resolution.  Bug reports can be submitted via the
 165 web:
 166
 167   https://github.com/bioperl/bioperl-live/issues
 168
 169 =head1 AUTHOR - Heikki Lehvaslaiho
 170
 171 Email:  heikki-at-bioperl-dot-org
 172
 173 =head1 APPENDIX
 174
 175 The rest of the documentation details each of the object
 176 methods. Internal methods are usually preceded with a _
 177
 178 =cut
 179
 180 # Let the code begin...
 181
 182 package Bio::Tools::CodonTable;
 183
 184 use vars qw(@NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA $CODONGAP $GAP
 185                 %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
 186 use strict;
 187
 188 # Object preamble - inherits from Bio::Root::Root
 189 use Bio::Tools::IUPAC;
 190 use Bio::SeqUtils;
 191
 192 use base qw(Bio::Root::Root);
 193
 194
 195 # first set internal values for all translation tables
 196
 197 BEGIN {
 198     use constant CODONSIZE => 3;
 199     $GAP = '-';
 200     $CODONGAP = $GAP x CODONSIZE;
 201
 202     @NAMES =            #id
 203     (
 204      'Strict', #0, special option for ATG-only start
 205      'Standard',        #1
 206      'Vertebrate Mitochondrial',#2
 207      'Yeast Mitochondrial',# 3
 208      'Mold, Protozoan, and Coelenterate Mitochondrial and Mycoplasma/Spiroplasma',#4
 209      'Invertebrate Mitochondrial',#5
 210      'Ciliate, Dasycladacean and Hexamita Nuclear',# 6
 211      '', '',
 212      'Echinoderm and Flatworm Mitochondrial',#9
 213      'Euplotid Nuclear',#10
 214      'Bacterial, Archaeal and Plant Plastid',# 11
 215      'Alternative Yeast Nuclear',# 12
 216      'Ascidian Mitochondrial',# 13
 217      'Alternative Flatworm Mitochondrial',# 14
 218      '',
 219      'Chlorophycean Mitochondrial',# 16
 220      '', '',  '', '',
 221      'Trematode Mitochondrial',# 21
 222      'Scenedesmus obliquus Mitochondrial', #22
 223      'Thraustochytrium Mitochondrial', #23
 224      'Pterobranchia Mitochondrial', #24
 225      'Candidate Division SR1 and Gracilibacteria', #25
 226      'Pachysolen tannophilus Nuclear Code',        #26
 227      'Karyorelict Nuclear', #27
 228      'Condylostoma Nuclear', #28
 229      'Mesodinium Nuclear', #29
 230      'Peritrich Nuclear', #30
 231      'Blastocrithidia Nuclear' #31
 232  );
 233
 234     @TABLES =
 235     qw(
 236        FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 237        FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 238        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG
 239        FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 240        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 241        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
 242        FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 243        '' ''
 244        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 245        FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 246        FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 247        FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 248        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
 249        FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 250        ''
 251        FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 252        '' '' '' ''
 253        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 254        FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 255        FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 256        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG
 257        FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 258        FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 259        FFLLSSSSYYQQCCWWLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 260        FFLLSSSSYYQQCCWWLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 261        FFLLSSSSYYYYCC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 262        FFLLSSSSYYEECC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 263        FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 264   );
 265
 266     #           (bases used for these tables, for reference)
 267     # 1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 268     # 2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 269     # 3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 270
 271     @STARTS =
 272         qw(
 273            ----------**--*--------------------M----------------------------
 274            ---M------**--*----M---------------M----------------------------
 275            ----------**--------------------MMMM----------**---M------------
 276            ----------**----------------------MM----------------------------
 277            --MM------**-------M------------MMMM---------------M------------
 278            ---M------**--------------------MMMM---------------M------------
 279            --------------*--------------------M----------------------------
 280            '' ''
 281            ----------**-----------------------M---------------M------------
 282            ----------**-----------------------M----------------------------
 283            ---M------**--*----M------------MMMM---------------M------------
 284            ----------**--*----M---------------M----------------------------
 285            ---M------**----------------------MM---------------M------------
 286            -----------*-----------------------M----------------------------
 287            ''
 288            ----------*---*--------------------M----------------------------
 289            '' ''  '' ''
 290            ----------**-----------------------M---------------M------------
 291            ------*---*---*--------------------M----------------------------
 292            --*-------**--*-----------------M--M---------------M------------
 293            ---M------**-------M---------------M---------------M------------
 294            ---M------**-----------------------M---------------M------------
 295            ----------**--*----M---------------M----------------------------
 296            --------------*--------------------M----------------------------
 297            ----------**--*--------------------M----------------------------
 298            --------------*--------------------M----------------------------
 299            --------------*--------------------M----------------------------
 300            ----------**-----------------------M----------------------------
 301       );
 302
 303     my @nucs = qw(t c a g);
 304     my $x = 0;
 305     ($CODONS, $TRCOL) = ({}, {});
 306     for my $i (@nucs) {
 307         for my $j (@nucs) {
 308             for my $k (@nucs) {
 309                 my $codon = "$i$j$k";
 310                 $CODONS->{$codon} = $x;
 311                 $TRCOL->{$x} = $codon;
 312                 $x++;
 313             }
 314         }
 315     }
 316     %IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub();
 317     %IUPAC_AA = Bio::Tools::IUPAC->iupac_iup();
 318     %THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2);
 319     $VALID_PROTEIN = '['.join('',Bio::SeqUtils->valid_aa(0)).']';
 320     $TERMINATOR = '*';
 321 }
 322
 323 sub new {
 324     my($class,@args) = @_;
 325     my $self = $class->SUPER::new(@args);
 326
 327     my($id) =
 328         $self->_rearrange([qw(ID
 329                  )],
 330              @args);
 331
 332     $id = 1 if ( ! defined ( $id ) );
 333     $id  && $self->id($id);
 334     return $self; # success - we hope!
 335 }
 336
 337 =head2 id
 338
 339  Title   : id
 340  Usage   : $obj->id(3); $id_integer = $obj->id();
 341  Function: Sets or returns the id of the translation table.  IDs are
 342            integers from 0 (special ATG-only start) to 25, excluding
 343            7-8 and 17-20 which have been removed. If an invalid ID is
 344            given the method returns 1, the standard table.
 345  Example :
 346  Returns : value of id, a scalar, warn and fall back to 1 (standard table)
 347            if specified id is not valid
 348  Args    : newvalue (optional)
 349
 350 =cut
 351
 352 sub id{
 353     my ($self,$value) = @_;
 354     if( defined $value) {
 355         if (  not defined $TABLES[$value] or $TABLES[$value] eq '') {
 356             $self->warn("Not a valid codon table ID [$value], using [1] instead ");
 357             $value = 1;
 358         }
 359         $self->{'id'} = $value;
 360     }
 361     return $self->{'id'};
 362 }
 363
 364 =head2 name
 365
 366  Title   : name
 367  Usage   : $obj->name()
 368  Function: returns the descriptive name of the translation table
 369  Example :
 370  Returns : A string
 371  Args    : None
 372
 373
 374 =cut
 375
 376 sub name{
 377    my ($self) = @_;
 378
 379    my ($id) = $self->{'id'};
 380    return $NAMES[$id];
 381 }
 382
 383 =head2 tables
 384
 385  Title   : tables
 386  Usage   : $obj->tables()  or  Bio::Tools::CodonTable->tables()
 387  Function: returns a hash reference where each key is a valid codon
 388            table id() number, and each value is the corresponding
 389            codon table name() string
 390  Example :
 391  Returns : A hashref
 392  Args    : None
 393
 394
 395 =cut
 396
 397 sub tables{
 398   my %tables;
 399   for my $id (0 .. $#NAMES) {
 400     my $name = $NAMES[$id];
 401     $tables{$id} = $name if $name;
 402   }
 403   return \%tables;
 404 }
 405
 406 =head2 translate
 407
 408  Title   : translate
 409  Usage   : $obj->translate('YTR')
 410  Function: Returns a string of one letter amino acid codes from
 411            nucleotide sequence input. The imput can be of any length.
 412
 413            Returns 'X' for unknown codons and codons that code for
 414            more than one amino acid. Returns an empty string if input
 415            is not three characters long. Exceptions for these are:
 416
 417              - IUPAC amino acid code B for Aspartic Acid and
 418                Asparagine, is used.
 419              - IUPAC amino acid code Z for Glutamic Acid, Glutamine is
 420                used.
 421              - if the codon is two nucleotides long and if by adding
 422                an a third character 'N', it codes for a single amino
 423                acid (with exceptions above), return that, otherwise
 424                return empty string.
 425
 426            Returns empty string for other input strings that are not
 427            three characters long.
 428
 429  Example :
 430  Returns : a string of one letter ambiguous IUPAC amino acid codes
 431  Args    : ambiguous IUPAC nucleotide string
 432
 433
 434 =cut
 435
 436 sub translate {
 437     my ($self, $seq, $complete_codon) = @_;
 438     $self->throw("Calling translate without a seq argument!") unless defined $seq;
 439     return '' unless $seq;
 440
 441     my $id = $self->id;
 442     my ($partial) = 0;
 443     $partial = 2 if length($seq) % CODONSIZE == 2;
 444
 445     $seq = lc $seq;
 446     $seq =~ tr/u/t/;
 447     my $protein = "";
 448     if ($seq =~ /[^actg]/ ) { #ambiguous chars
 449         for (my $i = 0; $i < (length($seq) - (CODONSIZE-1)); $i+= CODONSIZE) {
 450             my $triplet = substr($seq, $i, CODONSIZE);
 451             if( $triplet eq $CODONGAP ) {
 452                 $protein .= $GAP;
 453             } elsif (exists $CODONS->{$triplet}) {
 454                 $protein .= substr($TABLES[$id],
 455                            $CODONS->{$triplet},1);
 456             } else {
 457                 $protein .= $self->_translate_ambiguous_codon($triplet);
 458             }
 459         }
 460     } else { # simple, strict translation
 461         for (my $i = 0; $i < (length($seq) - (CODONSIZE -1)); $i+=CODONSIZE) {
 462             my $triplet = substr($seq, $i, CODONSIZE);
 463             if( $triplet eq $CODONGAP ) {
 464                 $protein .= $GAP;
 465             }
 466             if (exists $CODONS->{$triplet}) {
 467                 $protein .= substr($TABLES[$id], $CODONS->{$triplet}, 1);
 468             } else {
 469                 $protein .= 'X';
 470             }
 471         }
 472     }
 473     if ($partial == 2 && $complete_codon) { # 2 overhanging nucleotides
 474         my $triplet = substr($seq, ($partial -4)). "n";
 475         if( $triplet eq $CODONGAP ) {
 476             $protein .= $GAP;
 477         } elsif (exists $CODONS->{$triplet}) {
 478             my $aa = substr($TABLES[$id], $CODONS->{$triplet},1);
 479             $protein .= $aa;
 480         } else {
 481             $protein .= $self->_translate_ambiguous_codon($triplet, $partial);
 482         }
 483     }
 484     return $protein;
 485 }
 486
 487 sub _translate_ambiguous_codon {
 488     my ($self, $triplet, $partial) = @_;
 489     $partial ||= 0;
 490     my $id = $self->id;
 491     my $aa;
 492     my @codons = $self->unambiguous_codons($triplet);
 493     my %aas =();
 494     foreach my $codon (@codons) {
 495         $aas{substr($TABLES[$id],$CODONS->{$codon},1)} = 1;
 496     }
 497     my $count = scalar keys %aas;
 498     if ( $count == 1 ) {
 499         $aa = (keys %aas)[0];
 500     }
 501     elsif ( $count == 2 ) {
 502         if ($aas{'D'} and $aas{'N'}) {
 503             $aa = 'B';
 504         }
 505         elsif ($aas{'E'} and $aas{'Q'}) {
 506             $aa = 'Z';
 507         } else {
 508             $partial ? ($aa = '') : ($aa = 'X');
 509         }
 510     } else {
 511         $partial ? ($aa = '') :  ($aa = 'X');
 512     }
 513     return $aa;
 514 }
 515
 516 =head2 translate_strict
 517
 518  Title   : translate_strict
 519  Usage   : $obj->translate_strict('ACT')
 520  Function: returns one letter amino acid code for a codon input
 521
 522            Fast and simple translation. User is responsible to resolve
 523            ambiguous nucleotide codes before calling this
 524            method. Returns 'X' for unknown codons and an empty string
 525            for input strings that are not three characters long.
 526
 527            It is not recommended to use this method in a production
 528            environment. Use method translate, instead.
 529
 530  Example :
 531  Returns : A string
 532  Args    : a codon = a three nucleotide character string
 533
 534
 535 =cut
 536
 537 sub translate_strict{
 538    my ($self, $value) = @_;
 539    my $id = $self->{'id'};
 540
 541    $value  = lc $value;
 542    $value  =~ tr/u/t/;
 543
 544    return '' unless length $value == 3;
 545
 546    return 'X' unless defined $CODONS->{$value};
 547
 548    return substr( $TABLES[$id], $CODONS->{$value}, 1 );
 549 }
 550
 551 =head2 revtranslate
 552
 553  Title   : revtranslate
 554  Usage   : $obj->revtranslate('G')
 555  Function: returns codons for an amino acid
 556
 557            Returns an empty string for unknown amino acid
 558            codes. Ambiguous IUPAC codes Asx,B, (Asp,D; Asn,N) and
 559            Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three
 560            letter amino acid codes are accepted. '*' and 'Ter' are
 561            used for terminator.
 562
 563            By default, the output codons are shown in DNA.  If the
 564            output is needed in RNA (tr/t/u/), add a second argument
 565            'RNA'.
 566
 567  Example : $obj->revtranslate('Gly', 'RNA')
 568  Returns : An array of three lower case letter strings i.e. codons
 569  Args    : amino acid, 'RNA'
 570
 571 =cut
 572
 573 sub revtranslate {
 574     my ($self, $value, $coding) = @_;
 575     my @codons;
 576
 577     if (length($value) == 3 ) {
 578         $value = lc $value;
 579         $value = ucfirst $value;
 580         $value = $THREELETTERSYMBOLS{$value};
 581     }
 582     if (    defined $value and $value =~ /$VALID_PROTEIN/
 583         and length($value) == 1
 584         ) {
 585         my $id = $self->{'id'};
 586
 587         $value = uc $value;
 588         my @aas = @{$IUPAC_AA{$value}};
 589         foreach my $aa (@aas) {
 590             #print $aa, " -2\n";
 591             $aa = '\*' if $aa eq '*';
 592             while ($TABLES[$id] =~ m/$aa/g) {
 593                 my $p = pos $TABLES[$id];
 594                 push (@codons, $TRCOL->{--$p});
 595             }
 596         }
 597     }
 598
 599     if ($coding and uc ($coding) eq 'RNA') {
 600         for my $i (0..$#codons)  {
 601             $codons[$i] =~ tr/t/u/;
 602         }
 603     }
 604
 605    return @codons;
 606 }
 607
 608 =head2 reverse_translate_all
 609
 610  Title   : reverse_translate_all
 611  Usage   : my $iup_str = $cttable->reverse_translate_all($seq_object)
 612            my $iup_str = $cttable->reverse_translate_all($seq_object,
 613                                                          $cutable,
 614                                                          15);
 615  Function: reverse translates a protein sequence into IUPAC nucleotide
 616            sequence. An 'X' in the protein sequence is converted to 'NNN'
 617            in the nucleotide sequence.
 618  Returns : a string
 619  Args    : a Bio::PrimarySeqI compatible object (mandatory)
 620            a Bio::CodonUsage::Table object and a threshold if only
 621              codons with a relative frequency above the threshold are
 622              to be considered.
 623 =cut
 624
 625 sub reverse_translate_all {
 626     my ($self, $obj, $cut, $threshold) = @_;
 627
 628     ## check args are OK
 629
 630     if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
 631         $self->throw(" I need a Bio::PrimarySeqI object, not a [".
 632                         ref($obj) . "]");
 633         }
 634     if($obj->alphabet ne 'protein') {
 635         $self->throw("Cannot reverse translate, need an amino acid sequence .".
 636                      "This sequence is of type [" . $obj->alphabet ."]");
 637         }
 638     my @data;
 639     my @seq = split '', $obj->seq;
 640
 641     ## if we're not supplying a codon usage table...
 642     if( !$cut && !$threshold) {
 643         ## get lists of possible codons for each aa.
 644         for my $aa (@seq) {
 645             if ($aa =~ /x/i) {
 646                 push @data, (['NNN']);
 647             }else {
 648                 my @cods = $self->revtranslate($aa);
 649                 push @data, \@cods;
 650             }
 651         }
 652     }else{
 653     #else we are supplying a codon usage table, we just want common codons
 654     #check args first.
 655         if(!$cut->isa('Bio::CodonUsage::Table'))    {
 656             $self->throw("I need a Bio::CodonUsage::Table object, not a [".
 657                      ref($cut). "].");
 658             }
 659         my $cod_ref = $cut->probable_codons($threshold);
 660         for my $aa (@seq) {
 661             if ($aa =~ /x/i) {
 662                 push @data, (['NNN']);
 663                 next;
 664                 }
 665             push @data, $cod_ref->{$aa};
 666         }
 667     }
 668
 669     return $self->_make_iupac_string(\@data);
 670 }
 671
 672 =head2 reverse_translate_best
 673
 674  Title   : reverse_translate_best
 675  Usage   : my $str = $cttable->reverse_translate_best($seq_object,$cutable);
 676  Function: Reverse translates a protein sequence into plain nucleotide
 677            sequence (GATC), uses the most common codon for each amino acid
 678  Returns : A string
 679  Args    : A Bio::PrimarySeqI compatible object and a Bio::CodonUsage::Table object
 680
 681 =cut
 682
 683 sub reverse_translate_best {
 684
 685     my ($self, $obj, $cut) = @_;
 686
 687     if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
 688         $self->throw(" I need a Bio::PrimarySeqI object, not a [".
 689                          ref($obj) . "]");
 690     }
 691     if ($obj->alphabet ne 'protein')    {
 692         $self->throw("Cannot reverse translate, need an amino acid sequence .".
 693                          "This sequence is of type [" . $obj->alphabet ."]");
 694     }
 695     if ( !$cut | !$cut->isa('Bio::CodonUsage::Table'))  {
 696         $self->throw("I need a Bio::CodonUsage::Table object, not a [".
 697                          ref($cut). "].");
 698     }
 699
 700     my $str = '';
 701     my @seq = split '', $obj->seq;
 702
 703     my $cod_ref = $cut->most_common_codons();
 704
 705     for my $aa ( @seq ) {
 706         if ($aa =~ /x/i) {
 707             $str .= 'NNN';
 708             next;
 709         }
 710         if ( defined $cod_ref->{$aa} ) {
 711             $str .= $cod_ref->{$aa};
 712         } else {
 713             $self->throw("Input sequence contains invalid character: $aa");
 714         }
 715     }
 716    return $str;
 717 }
 718
 719 =head2 is_start_codon
 720
 721  Title   : is_start_codon
 722  Usage   : $obj->is_start_codon('ATG')
 723  Function: returns true (1) for all codons that can be used as a
 724            translation start, false (0) for others.
 725  Example : $myCodonTable->is_start_codon('ATG')
 726  Returns : boolean
 727  Args    : codon
 728
 729 =cut
 730
 731 sub is_start_codon{
 732    shift->_codon_is( shift, \@STARTS, 'M' );
 733 }
 734
 735 =head2 is_ter_codon
 736
 737  Title   : is_ter_codon
 738  Usage   : $obj->is_ter_codon('GAA')
 739  Function: returns true (1) for all codons that can be used as a
 740            translation tarminator, false (0) for others.
 741  Example : $myCodonTable->is_ter_codon('ATG')
 742  Returns : boolean
 743  Args    : codon
 744
 745 =cut
 746
 747 sub is_ter_codon{
 748    my ($self, $value) = @_;
 749    my $id = $self->{'id'};
 750
 751    # We need to ensure U is mapped to T (ie. UAG)
 752    $value = uc $value;
 753    $value =~ tr/U/T/;
 754
 755    if (length $value != 3  )  {
 756        # Incomplete codons are not stop codons
 757        return 0;
 758    } else {
 759        my $result = 0;
 760
 761        # For all the possible codons, if any are not a stop
 762        # codon, fail immediately
 763        for my $c ( $self->unambiguous_codons($value) ) {
 764            my $m = substr( $TABLES[$id], $CODONS->{$c}, 1 );
 765            if($m eq $TERMINATOR) {
 766                $result = 1;
 767            } else {
 768                return 0;
 769            }
 770        }
 771        return $result;
 772    }
 773 }
 774
 775 # desc: compares the passed value with a single entry in the given
 776 #       codon table
 777 # args: a value (typically a three-char string like 'atg'),
 778 #       a reference to the appropriate set of codon tables,
 779 #       a single-character value to check for at the position in the
 780 #       given codon table
 781 # ret:  boolean, true if the given codon table contains the $key at the
 782 #       position corresponding to $value
 783 sub _codon_is {
 784    my ($self, $value, $table, $key ) = @_;
 785
 786    return 0 unless length $value == 3;
 787
 788    $value  = lc $value;
 789    $value  =~ tr/u/t/;
 790
 791    my $id = $self->{'id'};
 792    for my $c ( $self->unambiguous_codons($value) ) {
 793        my $m = substr( $table->[$id], $CODONS->{$c}, 1 );
 794        if ($m eq $key) { return 1; }
 795    }
 796    return 0;
 797 }
 798
 799 =head2 is_unknown_codon
 800
 801  Title   : is_unknown_codon
 802  Usage   : $obj->is_unknown_codon('GAJ')
 803  Function: returns false (0) for all codons that are valid,
 804         true (1) for others.
 805  Example : $myCodonTable->is_unknown_codon('NTG')
 806  Returns : boolean
 807  Args    : codon
 808
 809
 810 =cut
 811
 812 sub is_unknown_codon{
 813    my ($self, $value) = @_;
 814    $value  = lc $value;
 815    $value  =~ tr/u/t/;
 816    return 1 unless $self->unambiguous_codons($value);
 817    return 0;
 818 }
 819
 820 =head2 unambiguous_codons
 821
 822  Title   : unambiguous_codons
 823  Usage   : @codons = $self->unambiguous_codons('ACN')
 824  Returns : array of strings (one-letter unambiguous amino acid codes)
 825  Args    : a codon = a three IUPAC nucleotide character string
 826
 827 =cut
 828
 829 sub unambiguous_codons{
 830     my ($self,$value) = @_;
 831     my @nts = map { $IUPAC_DNA{uc $_} }  split(//, $value);
 832
 833     my @codons;
 834     for my $i ( @{$nts[0]} ) {
 835     for my $j ( @{$nts[1]} ) {
 836     for my $k ( @{$nts[2]} ) {
 837         push @codons, lc "$i$j$k";
 838     }}}
 839     return @codons;
 840 }
 841
 842 =head2 _unambiquous_codons
 843
 844 deprecated, now an alias for unambiguous_codons
 845
 846 =cut
 847
 848 sub _unambiquous_codons {
 849     unambiguous_codons( undef, @_ );
 850 }
 851
 852 =head2 add_table
 853
 854  Title   : add_table
 855  Usage   : $newid = $ct->add_table($name, $table, $starts)
 856  Function: Add a custom Codon Table into the object.
 857            Know what you are doing, only the length of
 858            the argument strings is checked!
 859  Returns : the id of the new codon table
 860  Args    : name, a string, optional (can be empty)
 861            table, a string of 64 characters
 862            startcodons, a string of 64 characters, defaults to standard
 863
 864 =cut
 865
 866 sub add_table {
 867     my ($self, $name, $table, $starts) = @_;
 868
 869     $name   ||= 'Custom' . $#NAMES + 1;
 870     $starts ||= $STARTS[1];
 871     $self->throw('Suspect input!')
 872         unless length($table) == 64 and length($starts) == 64;
 873
 874     push @NAMES,  $name;
 875     push @TABLES, $table;
 876     push @STARTS, $starts;
 877
 878     return $#NAMES;
 879 }
 880
 881 sub _make_iupac_string {
 882     my ($self, $cod_ref) = @_;
 883     if(ref($cod_ref) ne 'ARRAY') {
 884         $self->throw(" I need a reference to a list of references to codons, ".
 885                      " not a [". ref($cod_ref) . "].");
 886         }
 887     my %iupac_hash   = Bio::Tools::IUPAC->iupac_rev_iub();
 888     my $iupac_string = ''; ## the string to be returned
 889     for my $aa (@$cod_ref) {
 890
 891         ## scan through codon positions, record the differing values,
 892         # then look up in the iub hash
 893         for my $index(0..2) {
 894             my %h;
 895             map { my $k = substr($_,$index,1);
 896                 $h{$k}  = undef;} @$aa;
 897             my $lookup_key = join '', sort{$a cmp $b}keys %h;
 898
 899             ## extend string
 900             $iupac_string .= $iupac_hash{uc$lookup_key};
 901         }
 902     }
 903     return $iupac_string;
 904 }
 905
 906
 907 1;