lib/Bio/Tools/CodonTable.pm

   1 #
   2 # bioperl module for Bio::Tools::CodonTable
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   7 #
   8 # Copyright Heikki Lehvaslaiho
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Tools::CodonTable - Codon table object
  17
  18 =head1 SYNOPSIS
  19
  20   # This is a read-only class for all known codon tables.  The IDs are
  21   # the ones used by nucleotide sequence databases.  All common IUPAC
  22   # ambiguity codes for DNA, RNA and amino acids are recognized.
  23
  24   use Bio::Tools::CodonTable;
  25
  26   # defaults to ID 1 "Standard"
  27   $myCodonTable   = Bio::Tools::CodonTable->new();
  28   $myCodonTable2  = Bio::Tools::CodonTable->new( -id => 3 );
  29
  30   # change codon table
  31   $myCodonTable->id(5);
  32
  33   # examine codon table
  34   print  join (' ', "The name of the codon table no.", $myCodonTable->id(4),
  35            "is:", $myCodonTable->name(), "\n");
  36
  37   # print possible codon tables
  38   $tables = Bio::Tools::CodonTable->tables;
  39   while ( ($id,$name) = each %{$tables} ) {
  40     print "$id = $name\n";
  41   }
  42
  43   # translate a codon
  44   $aa = $myCodonTable->translate('ACU');
  45   $aa = $myCodonTable->translate('act');
  46   $aa = $myCodonTable->translate('ytr');
  47
  48   # reverse translate an amino acid
  49   @codons = $myCodonTable->revtranslate('A');
  50   @codons = $myCodonTable->revtranslate('Ser');
  51   @codons = $myCodonTable->revtranslate('Glx');
  52   @codons = $myCodonTable->revtranslate('cYS', 'rna');
  53
  54   # reverse translate an entire amino acid sequence into a IUPAC
  55   # nucleotide string
  56
  57   my $seqobj    = Bio::PrimarySeq->new(-seq => 'FHGERHEL');
  58   my $iupac_str = $myCodonTable->reverse_translate_all($seqobj);
  59
  60   # boolean tests
  61   print "Is a start\n"       if $myCodonTable->is_start_codon('ATG');
  62   print "Is a terminator\n"  if $myCodonTable->is_ter_codon('tar');
  63   print "Is a unknown\n"     if $myCodonTable->is_unknown_codon('JTG');
  64
  65 =head1 DESCRIPTION
  66
  67 Codon tables are also called translation tables or genetic codes
  68 since that is what they represent. A bit more complete picture
  69 of the full complexity of codon usage in various taxonomic groups
  70 is presented at the NCBI Genetic Codes Home page.
  71
  72 CodonTable is a BioPerl class that knows all current translation
  73 tables that are used by primary nucleotide sequence databases
  74 (GenBank, EMBL and DDBJ). It provides methods to output information
  75 about tables and relationships between codons and amino acids.
  76
  77 This class and its methods recognized all common IUPAC ambiguity codes
  78 for DNA, RNA and animo acids. The translation method follows the
  79 conventions in EMBL and TREMBL databases.
  80
  81 It is a nuisance to separate RNA and cDNA representations of nucleic
  82 acid transcripts. The CodonTable object accepts codons of both type as
  83 input and allows the user to set the mode for output when reverse
  84 translating. Its default for output is DNA.
  85
  86 Note:
  87
  88 This class deals primarily with individual codons and amino
  89 acids. However in the interest of speed you can L<translate>
  90 longer sequence, too. The full complexity of protein translation
  91 is tackled by L<Bio::PrimarySeqI::translate>.
  92
  93
  94 The amino acid codes are IUPAC recommendations for common amino acids:
  95
  96           A           Ala            Alanine
  97           R           Arg            Arginine
  98           N           Asn            Asparagine
  99           D           Asp            Aspartic acid
 100           C           Cys            Cysteine
 101           Q           Gln            Glutamine
 102           E           Glu            Glutamic acid
 103           G           Gly            Glycine
 104           H           His            Histidine
 105           I           Ile            Isoleucine
 106           L           Leu            Leucine
 107           K           Lys            Lysine
 108           M           Met            Methionine
 109           F           Phe            Phenylalanine
 110           P           Pro            Proline
 111           O           Pyl            Pyrrolysine (22nd amino acid)
 112           U           Sec            Selenocysteine (21st amino acid)
 113           S           Ser            Serine
 114           T           Thr            Threonine
 115           W           Trp            Tryptophan
 116           Y           Tyr            Tyrosine
 117           V           Val            Valine
 118           B           Asx            Aspartic acid or Asparagine
 119           Z           Glx            Glutamine or Glutamic acid
 120           J           Xle            Isoleucine or Valine (mass spec ambiguity)
 121           X           Xaa            Any or unknown amino acid
 122
 123
 124 It is worth noting that, "Bacterial" codon table no. 11 produces an
 125 polypeptide that is, confusingly, identical to the standard one. The
 126 only differences are in available initiator codons.
 127
 128
 129 NCBI Genetic Codes home page:
 130      (Last update of the Genetic Codes: Nov. 18, 2016)
 131      https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
 132
 133 ASN.1 version with ids 1 to 25 is at:
 134      ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
 135
 136 Thanks to Matteo diTomasso for the original Perl implementation
 137 of these tables.
 138
 139 =head1 FEEDBACK
 140
 141 =head2 Mailing Lists
 142
 143 User feedback is an integral part of the evolution of this and other
 144 Bioperl modules. Send your comments and suggestions preferably to the
 145 Bioperl mailing lists  Your participation is much appreciated.
 146
 147   bioperl-l@bioperl.org                  - General discussion
 148   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 149
 150 =head2 Support
 151
 152 Please direct usage questions or support issues to the mailing list:
 153
 154 I<bioperl-l@bioperl.org>
 155
 156 rather than to the module maintainer directly. Many experienced and
 157 reponsive experts will be able look at the problem and quickly
 158 address it. Please include a thorough description of the problem
 159 with code and data examples if at all possible.
 160
 161 =head2 Reporting Bugs
 162
 163 Report bugs to the Bioperl bug tracking system to help us keep track
 164 the bugs and their resolution.  Bug reports can be submitted via the
 165 web:
 166
 167   https://github.com/bioperl/bioperl-live/issues
 168
 169 =head1 AUTHOR - Heikki Lehvaslaiho
 170
 171 Email:  heikki-at-bioperl-dot-org
 172
 173 =head1 APPENDIX
 174
 175 The rest of the documentation details each of the object
 176 methods. Internal methods are usually preceded with a _
 177
 178 =cut
 179
 180 # Let the code begin...
 181
 182 package Bio::Tools::CodonTable;
 183 use vars qw(@NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA $CODONGAP $GAP
 184                 %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
 185 use strict;
 186
 187 # Object preamble - inherits from Bio::Root::Root
 188 use Bio::Tools::IUPAC;
 189 use Bio::SeqUtils;
 190
 191 use base qw(Bio::Root::Root);
 192
 193
 194 # first set internal values for all translation tables
 195
 196 BEGIN {
 197     use constant CODONSIZE => 3;
 198     $GAP = '-';
 199     $CODONGAP = $GAP x CODONSIZE;
 200
 201     @NAMES =            #id
 202     (
 203      'Strict', #0, special option for ATG-only start
 204      'Standard',        #1
 205      'Vertebrate Mitochondrial',#2
 206      'Yeast Mitochondrial',# 3
 207      'Mold, Protozoan, and Coelenterate Mitochondrial and Mycoplasma/Spiroplasma',#4
 208      'Invertebrate Mitochondrial',#5
 209      'Ciliate, Dasycladacean and Hexamita Nuclear',# 6
 210      '', '',
 211      'Echinoderm and Flatworm Mitochondrial',#9
 212      'Euplotid Nuclear',#10
 213      'Bacterial, Archaeal and Plant Plastid',# 11
 214      'Alternative Yeast Nuclear',# 12
 215      'Ascidian Mitochondrial',# 13
 216      'Alternative Flatworm Mitochondrial',# 14
 217      '',
 218      'Chlorophycean Mitochondrial',# 16
 219      '', '',  '', '',
 220      'Trematode Mitochondrial',# 21
 221      'Scenedesmus obliquus Mitochondrial', #22
 222      'Thraustochytrium Mitochondrial', #23
 223      'Pterobranchia Mitochondrial', #24
 224      'Candidate Division SR1 and Gracilibacteria', #25
 225      'Pachysolen tannophilus Nuclear Code',        #26
 226      'Karyorelict Nuclear', #27
 227      'Condylostoma Nuclear', #28
 228      'Mesodinium Nuclear', #29
 229      'Peritrich Nuclear', #30
 230      'Blastocrithidia Nuclear' #31
 231  );
 232
 233     @TABLES =
 234     qw(
 235        FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 236        FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 237        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG
 238        FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 239        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 240        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
 241        FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 242        '' ''
 243        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 244        FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 245        FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 246        FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 247        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
 248        FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 249        ''
 250        FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 251        '' '' '' ''
 252        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
 253        FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 254        FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 255        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG
 256        FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 257        FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 258        FFLLSSSSYYQQCCWWLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 259        FFLLSSSSYYQQCCWWLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 260        FFLLSSSSYYYYCC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 261        FFLLSSSSYYEECC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 262        FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
 263   );
 264
 265     #           (bases used for these tables, for reference)
 266     # 1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 267     # 2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 268     # 3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 269
 270     @STARTS =
 271         qw(
 272            ----------**--*--------------------M----------------------------
 273            ---M------**--*----M---------------M----------------------------
 274            ----------**--------------------MMMM----------**---M------------
 275            ----------**----------------------MM----------------------------
 276            --MM------**-------M------------MMMM---------------M------------
 277            ---M------**--------------------MMMM---------------M------------
 278            --------------*--------------------M----------------------------
 279            '' ''
 280            ----------**-----------------------M---------------M------------
 281            ----------**-----------------------M----------------------------
 282            ---M------**--*----M------------MMMM---------------M------------
 283            ----------**--*----M---------------M----------------------------
 284            ---M------**----------------------MM---------------M------------
 285            -----------*-----------------------M----------------------------
 286            ''
 287            ----------*---*--------------------M----------------------------
 288            '' ''  '' ''
 289            ----------**-----------------------M---------------M------------
 290            ------*---*---*--------------------M----------------------------
 291            --*-------**--*-----------------M--M---------------M------------
 292            ---M------**-------M---------------M---------------M------------
 293            ---M------**-----------------------M---------------M------------
 294            ----------**--*----M---------------M----------------------------
 295            --------------*--------------------M----------------------------
 296            ----------**--*--------------------M----------------------------
 297            --------------*--------------------M----------------------------
 298            --------------*--------------------M----------------------------
 299            ----------**-----------------------M----------------------------
 300       );
 301
 302     my @nucs = qw(t c a g);
 303     my $x = 0;
 304     ($CODONS, $TRCOL) = ({}, {});
 305     for my $i (@nucs) {
 306         for my $j (@nucs) {
 307             for my $k (@nucs) {
 308                 my $codon = "$i$j$k";
 309                 $CODONS->{$codon} = $x;
 310                 $TRCOL->{$x} = $codon;
 311                 $x++;
 312             }
 313         }
 314     }
 315     %IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub();
 316     %IUPAC_AA = Bio::Tools::IUPAC->iupac_iup();
 317     %THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2);
 318     $VALID_PROTEIN = '['.join('',Bio::SeqUtils->valid_aa(0)).']';
 319     $TERMINATOR = '*';
 320 }
 321
 322 sub new {
 323     my($class,@args) = @_;
 324     my $self = $class->SUPER::new(@args);
 325
 326     my($id) =
 327         $self->_rearrange([qw(ID
 328                  )],
 329              @args);
 330
 331     $id = 1 if ( ! $id );
 332     $id  && $self->id($id);
 333     return $self; # success - we hope!
 334 }
 335
 336 =head2 id
 337
 338  Title   : id
 339  Usage   : $obj->id(3); $id_integer = $obj->id();
 340  Function: Sets or returns the id of the translation table.  IDs are
 341            integers from 0 (special ATG-only start) to 25, excluding
 342            7-8 and 17-20 which have been removed. If an invalid ID is
 343            given the method returns 1, the standard table.
 344  Example :
 345  Returns : value of id, a scalar, warn and fall back to 1 (standard table)
 346            if specified id is not valid
 347  Args    : newvalue (optional)
 348
 349 =cut
 350
 351 sub id{
 352     my ($self,$value) = @_;
 353     if( defined $value) {
 354         if (  not defined $TABLES[$value] or $TABLES[$value] eq '') {
 355             $self->warn("Not a valid codon table ID [$value], using [1] instead ");
 356             $value = 1;
 357         }
 358         $self->{'id'} = $value;
 359     }
 360     return $self->{'id'};
 361 }
 362
 363 =head2 name
 364
 365  Title   : name
 366  Usage   : $obj->name()
 367  Function: returns the descriptive name of the translation table
 368  Example :
 369  Returns : A string
 370  Args    : None
 371
 372
 373 =cut
 374
 375 sub name{
 376    my ($self) = @_;
 377
 378    my ($id) = $self->{'id'};
 379    return $NAMES[$id];
 380 }
 381
 382 =head2 tables
 383
 384  Title   : tables
 385  Usage   : $obj->tables()  or  Bio::Tools::CodonTable->tables()
 386  Function: returns a hash reference where each key is a valid codon
 387            table id() number, and each value is the corresponding
 388            codon table name() string
 389  Example :
 390  Returns : A hashref
 391  Args    : None
 392
 393
 394 =cut
 395
 396 sub tables{
 397   my %tables;
 398   for my $id (0 .. $#NAMES) {
 399     my $name = $NAMES[$id];
 400     $tables{$id} = $name if $name;
 401   }
 402   return \%tables;
 403 }
 404
 405 =head2 translate
 406
 407  Title   : translate
 408  Usage   : $obj->translate('YTR')
 409  Function: Returns a string of one letter amino acid codes from
 410            nucleotide sequence input. The imput can be of any length.
 411
 412            Returns 'X' for unknown codons and codons that code for
 413            more than one amino acid. Returns an empty string if input
 414            is not three characters long. Exceptions for these are:
 415
 416              - IUPAC amino acid code B for Aspartic Acid and
 417                Asparagine, is used.
 418              - IUPAC amino acid code Z for Glutamic Acid, Glutamine is
 419                used.
 420              - if the codon is two nucleotides long and if by adding
 421                an a third character 'N', it codes for a single amino
 422                acid (with exceptions above), return that, otherwise
 423                return empty string.
 424
 425            Returns empty string for other input strings that are not
 426            three characters long.
 427
 428  Example :
 429  Returns : a string of one letter ambiguous IUPAC amino acid codes
 430  Args    : ambiguous IUPAC nucleotide string
 431
 432
 433 =cut
 434
 435 sub translate {
 436     my ($self, $seq, $complete_codon) = @_;
 437     $self->throw("Calling translate without a seq argument!") unless defined $seq;
 438     return '' unless $seq;
 439
 440     my $id = $self->id;
 441     my ($partial) = 0;
 442     $partial = 2 if length($seq) % CODONSIZE == 2;
 443
 444     $seq = lc $seq;
 445     $seq =~ tr/u/t/;
 446     my $protein = "";
 447     if ($seq =~ /[^actg]/ ) { #ambiguous chars
 448         for (my $i = 0; $i < (length($seq) - (CODONSIZE-1)); $i+= CODONSIZE) {
 449             my $triplet = substr($seq, $i, CODONSIZE);
 450             if( $triplet eq $CODONGAP ) {
 451                 $protein .= $GAP;
 452             } elsif (exists $CODONS->{$triplet}) {
 453                 $protein .= substr($TABLES[$id],
 454                            $CODONS->{$triplet},1);
 455             } else {
 456                 $protein .= $self->_translate_ambiguous_codon($triplet);
 457             }
 458         }
 459     } else { # simple, strict translation
 460         for (my $i = 0; $i < (length($seq) - (CODONSIZE -1)); $i+=CODONSIZE) {
 461             my $triplet = substr($seq, $i, CODONSIZE);
 462             if( $triplet eq $CODONGAP ) {
 463                 $protein .= $GAP;
 464             }
 465             if (exists $CODONS->{$triplet}) {
 466                 $protein .= substr($TABLES[$id], $CODONS->{$triplet}, 1);
 467             } else {
 468                 $protein .= 'X';
 469             }
 470         }
 471     }
 472     if ($partial == 2 && $complete_codon) { # 2 overhanging nucleotides
 473         my $triplet = substr($seq, ($partial -4)). "n";
 474         if( $triplet eq $CODONGAP ) {
 475             $protein .= $GAP;
 476         } elsif (exists $CODONS->{$triplet}) {
 477             my $aa = substr($TABLES[$id], $CODONS->{$triplet},1);
 478             $protein .= $aa;
 479         } else {
 480             $protein .= $self->_translate_ambiguous_codon($triplet, $partial);
 481         }
 482     }
 483     return $protein;
 484 }
 485
 486 sub _translate_ambiguous_codon {
 487     my ($self, $triplet, $partial) = @_;
 488     $partial ||= 0;
 489     my $id = $self->id;
 490     my $aa;
 491     my @codons = $self->unambiguous_codons($triplet);
 492     my %aas =();
 493     foreach my $codon (@codons) {
 494         $aas{substr($TABLES[$id],$CODONS->{$codon},1)} = 1;
 495     }
 496     my $count = scalar keys %aas;
 497     if ( $count == 1 ) {
 498         $aa = (keys %aas)[0];
 499     }
 500     elsif ( $count == 2 ) {
 501         if ($aas{'D'} and $aas{'N'}) {
 502             $aa = 'B';
 503         }
 504         elsif ($aas{'E'} and $aas{'Q'}) {
 505             $aa = 'Z';
 506         } else {
 507             $partial ? ($aa = '') : ($aa = 'X');
 508         }
 509     } else {
 510         $partial ? ($aa = '') :  ($aa = 'X');
 511     }
 512     return $aa;
 513 }
 514
 515 =head2 translate_strict
 516
 517  Title   : translate_strict
 518  Usage   : $obj->translate_strict('ACT')
 519  Function: returns one letter amino acid code for a codon input
 520
 521            Fast and simple translation. User is responsible to resolve
 522            ambiguous nucleotide codes before calling this
 523            method. Returns 'X' for unknown codons and an empty string
 524            for input strings that are not three characters long.
 525
 526            It is not recommended to use this method in a production
 527            environment. Use method translate, instead.
 528
 529  Example :
 530  Returns : A string
 531  Args    : a codon = a three nucleotide character string
 532
 533
 534 =cut
 535
 536 sub translate_strict{
 537    my ($self, $value) = @_;
 538    my $id = $self->{'id'};
 539
 540    $value  = lc $value;
 541    $value  =~ tr/u/t/;
 542
 543    return '' unless length $value == 3;
 544
 545    return 'X' unless defined $CODONS->{$value};
 546
 547    return substr( $TABLES[$id], $CODONS->{$value}, 1 );
 548 }
 549
 550 =head2 revtranslate
 551
 552  Title   : revtranslate
 553  Usage   : $obj->revtranslate('G')
 554  Function: returns codons for an amino acid
 555
 556            Returns an empty string for unknown amino acid
 557            codes. Ambiguous IUPAC codes Asx,B, (Asp,D; Asn,N) and
 558            Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three
 559            letter amino acid codes are accepted. '*' and 'Ter' are
 560            used for terminator.
 561
 562            By default, the output codons are shown in DNA.  If the
 563            output is needed in RNA (tr/t/u/), add a second argument
 564            'RNA'.
 565
 566  Example : $obj->revtranslate('Gly', 'RNA')
 567  Returns : An array of three lower case letter strings i.e. codons
 568  Args    : amino acid, 'RNA'
 569
 570 =cut
 571
 572 sub revtranslate {
 573     my ($self, $value, $coding) = @_;
 574     my @codons;
 575
 576     if (length($value) == 3 ) {
 577         $value = lc $value;
 578         $value = ucfirst $value;
 579         $value = $THREELETTERSYMBOLS{$value};
 580     }
 581     if (    defined $value and $value =~ /$VALID_PROTEIN/
 582         and length($value) == 1
 583         ) {
 584         my $id = $self->{'id'};
 585
 586         $value = uc $value;
 587         my @aas = @{$IUPAC_AA{$value}};
 588         foreach my $aa (@aas) {
 589             #print $aa, " -2\n";
 590             $aa = '\*' if $aa eq '*';
 591             while ($TABLES[$id] =~ m/$aa/g) {
 592                 my $p = pos $TABLES[$id];
 593                 push (@codons, $TRCOL->{--$p});
 594             }
 595         }
 596     }
 597
 598     if ($coding and uc ($coding) eq 'RNA') {
 599         for my $i (0..$#codons)  {
 600             $codons[$i] =~ tr/t/u/;
 601         }
 602     }
 603
 604    return @codons;
 605 }
 606
 607 =head2 reverse_translate_all
 608
 609  Title   : reverse_translate_all
 610  Usage   : my $iup_str = $cttable->reverse_translate_all($seq_object)
 611            my $iup_str = $cttable->reverse_translate_all($seq_object,
 612                                                          $cutable,
 613                                                          15);
 614  Function: reverse translates a protein sequence into IUPAC nucleotide
 615            sequence. An 'X' in the protein sequence is converted to 'NNN'
 616            in the nucleotide sequence.
 617  Returns : a string
 618  Args    : a Bio::PrimarySeqI compatible object (mandatory)
 619            a Bio::CodonUsage::Table object and a threshold if only
 620              codons with a relative frequency above the threshold are
 621              to be considered.
 622 =cut
 623
 624 sub reverse_translate_all {
 625     my ($self, $obj, $cut, $threshold) = @_;
 626
 627     ## check args are OK
 628
 629     if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
 630         $self->throw(" I need a Bio::PrimarySeqI object, not a [".
 631                         ref($obj) . "]");
 632         }
 633     if($obj->alphabet ne 'protein') {
 634         $self->throw("Cannot reverse translate, need an amino acid sequence .".
 635                      "This sequence is of type [" . $obj->alphabet ."]");
 636         }
 637     my @data;
 638     my @seq = split '', $obj->seq;
 639
 640     ## if we're not supplying a codon usage table...
 641     if( !$cut && !$threshold) {
 642         ## get lists of possible codons for each aa.
 643         for my $aa (@seq) {
 644             if ($aa =~ /x/i) {
 645                 push @data, (['NNN']);
 646             }else {
 647                 my @cods = $self->revtranslate($aa);
 648                 push @data, \@cods;
 649             }
 650         }
 651     }else{
 652     #else we are supplying a codon usage table, we just want common codons
 653     #check args first.
 654         if(!$cut->isa('Bio::CodonUsage::Table'))    {
 655             $self->throw("I need a Bio::CodonUsage::Table object, not a [".
 656                      ref($cut). "].");
 657             }
 658         my $cod_ref = $cut->probable_codons($threshold);
 659         for my $aa (@seq) {
 660             if ($aa =~ /x/i) {
 661                 push @data, (['NNN']);
 662                 next;
 663                 }
 664             push @data, $cod_ref->{$aa};
 665         }
 666     }
 667
 668     return $self->_make_iupac_string(\@data);
 669 }
 670
 671 =head2 reverse_translate_best
 672
 673  Title   : reverse_translate_best
 674  Usage   : my $str = $cttable->reverse_translate_best($seq_object,$cutable);
 675  Function: Reverse translates a protein sequence into plain nucleotide
 676            sequence (GATC), uses the most common codon for each amino acid
 677  Returns : A string
 678  Args    : A Bio::PrimarySeqI compatible object and a Bio::CodonUsage::Table object
 679
 680 =cut
 681
 682 sub reverse_translate_best {
 683
 684     my ($self, $obj, $cut) = @_;
 685
 686     if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
 687         $self->throw(" I need a Bio::PrimarySeqI object, not a [".
 688                          ref($obj) . "]");
 689     }
 690     if ($obj->alphabet ne 'protein')    {
 691         $self->throw("Cannot reverse translate, need an amino acid sequence .".
 692                          "This sequence is of type [" . $obj->alphabet ."]");
 693     }
 694     if ( !$cut | !$cut->isa('Bio::CodonUsage::Table'))  {
 695         $self->throw("I need a Bio::CodonUsage::Table object, not a [".
 696                          ref($cut). "].");
 697     }
 698
 699     my $str = '';
 700     my @seq = split '', $obj->seq;
 701
 702     my $cod_ref = $cut->most_common_codons();
 703
 704     for my $aa ( @seq ) {
 705         if ($aa =~ /x/i) {
 706             $str .= 'NNN';
 707             next;
 708         }
 709         if ( defined $cod_ref->{$aa} ) {
 710             $str .= $cod_ref->{$aa};
 711         } else {
 712             $self->throw("Input sequence contains invalid character: $aa");
 713         }
 714     }
 715    return $str;
 716 }
 717
 718 =head2 is_start_codon
 719
 720  Title   : is_start_codon
 721  Usage   : $obj->is_start_codon('ATG')
 722  Function: returns true (1) for all codons that can be used as a
 723            translation start, false (0) for others.
 724  Example : $myCodonTable->is_start_codon('ATG')
 725  Returns : boolean
 726  Args    : codon
 727
 728 =cut
 729
 730 sub is_start_codon{
 731    shift->_codon_is( shift, \@STARTS, 'M' );
 732 }
 733
 734 =head2 is_ter_codon
 735
 736  Title   : is_ter_codon
 737  Usage   : $obj->is_ter_codon('GAA')
 738  Function: returns true (1) for all codons that can be used as a
 739            translation tarminator, false (0) for others.
 740  Example : $myCodonTable->is_ter_codon('ATG')
 741  Returns : boolean
 742  Args    : codon
 743
 744 =cut
 745
 746 sub is_ter_codon{
 747    my ($self, $value) = @_;
 748    my $id = $self->{'id'};
 749
 750    # We need to ensure U is mapped to T (ie. UAG)
 751    $value = uc $value;
 752    $value =~ tr/U/T/;
 753
 754    if (length $value != 3  )  {
 755        # Incomplete codons are not stop codons
 756        return 0;
 757    } else {
 758        my $result = 0;
 759
 760        # For all the possible codons, if any are not a stop
 761        # codon, fail immediately
 762        for my $c ( $self->unambiguous_codons($value) ) {
 763            my $m = substr( $TABLES[$id], $CODONS->{$c}, 1 );
 764            if($m eq $TERMINATOR) {
 765                $result = 1;
 766            } else {
 767                return 0;
 768            }
 769        }
 770        return $result;
 771    }
 772 }
 773
 774 # desc: compares the passed value with a single entry in the given
 775 #       codon table
 776 # args: a value (typically a three-char string like 'atg'),
 777 #       a reference to the appropriate set of codon tables,
 778 #       a single-character value to check for at the position in the
 779 #       given codon table
 780 # ret:  boolean, true if the given codon table contains the $key at the
 781 #       position corresponding to $value
 782 sub _codon_is {
 783    my ($self, $value, $table, $key ) = @_;
 784
 785    return 0 unless length $value == 3;
 786
 787    $value  = lc $value;
 788    $value  =~ tr/u/t/;
 789
 790    my $id = $self->{'id'};
 791    for my $c ( $self->unambiguous_codons($value) ) {
 792        my $m = substr( $table->[$id], $CODONS->{$c}, 1 );
 793        if ($m eq $key) { return 1; }
 794    }
 795    return 0;
 796 }
 797
 798 =head2 is_unknown_codon
 799
 800  Title   : is_unknown_codon
 801  Usage   : $obj->is_unknown_codon('GAJ')
 802  Function: returns false (0) for all codons that are valid,
 803         true (1) for others.
 804  Example : $myCodonTable->is_unknown_codon('NTG')
 805  Returns : boolean
 806  Args    : codon
 807
 808
 809 =cut
 810
 811 sub is_unknown_codon{
 812    my ($self, $value) = @_;
 813    $value  = lc $value;
 814    $value  =~ tr/u/t/;
 815    return 1 unless $self->unambiguous_codons($value);
 816    return 0;
 817 }
 818
 819 =head2 unambiguous_codons
 820
 821  Title   : unambiguous_codons
 822  Usage   : @codons = $self->unambiguous_codons('ACN')
 823  Returns : array of strings (one-letter unambiguous amino acid codes)
 824  Args    : a codon = a three IUPAC nucleotide character string
 825
 826 =cut
 827
 828 sub unambiguous_codons{
 829     my ($self,$value) = @_;
 830     my @nts = map { $IUPAC_DNA{uc $_} }  split(//, $value);
 831
 832     my @codons;
 833     for my $i ( @{$nts[0]} ) {
 834     for my $j ( @{$nts[1]} ) {
 835     for my $k ( @{$nts[2]} ) {
 836         push @codons, lc "$i$j$k";
 837     }}}
 838     return @codons;
 839 }
 840
 841 =head2 _unambiquous_codons
 842
 843 deprecated, now an alias for unambiguous_codons
 844
 845 =cut
 846
 847 sub _unambiquous_codons {
 848     unambiguous_codons( undef, @_ );
 849 }
 850
 851 =head2 add_table
 852
 853  Title   : add_table
 854  Usage   : $newid = $ct->add_table($name, $table, $starts)
 855  Function: Add a custom Codon Table into the object.
 856            Know what you are doing, only the length of
 857            the argument strings is checked!
 858  Returns : the id of the new codon table
 859  Args    : name, a string, optional (can be empty)
 860            table, a string of 64 characters
 861            startcodons, a string of 64 characters, defaults to standard
 862
 863 =cut
 864
 865 sub add_table {
 866     my ($self, $name, $table, $starts) = @_;
 867
 868     $name   ||= 'Custom' . $#NAMES + 1;
 869     $starts ||= $STARTS[1];
 870     $self->throw('Suspect input!')
 871         unless length($table) == 64 and length($starts) == 64;
 872
 873     push @NAMES,  $name;
 874     push @TABLES, $table;
 875     push @STARTS, $starts;
 876
 877     return $#NAMES;
 878 }
 879
 880 sub _make_iupac_string {
 881     my ($self, $cod_ref) = @_;
 882     if(ref($cod_ref) ne 'ARRAY') {
 883         $self->throw(" I need a reference to a list of references to codons, ".
 884                      " not a [". ref($cod_ref) . "].");
 885         }
 886     my %iupac_hash   = Bio::Tools::IUPAC->iupac_rev_iub();
 887     my $iupac_string = ''; ## the string to be returned
 888     for my $aa (@$cod_ref) {
 889
 890         ## scan through codon positions, record the differing values,
 891         # then look up in the iub hash
 892         for my $index(0..2) {
 893             my %h;
 894             map { my $k = substr($_,$index,1);
 895                 $h{$k}  = undef;} @$aa;
 896             my $lookup_key = join '', sort{$a cmp $b}keys %h;
 897
 898             ## extend string
 899             $iupac_string .= $iupac_hash{uc$lookup_key};
 900         }
 901     }
 902     return $iupac_string;
 903 }
 904
 905
 906 1;