lib/Bio/Tools/CodonTable.pm

   1 package Bio::Tools::CodonTable;
   2
   3 use utf8;
   4 use strict;
   5 use warnings;
   6
   7 use Bio::Tools::IUPAC;
   8 use Bio::SeqUtils;
   9
  10 use base qw(Bio::Root::Root);
  11
  12 # ABSTRACT: Codon table object
  13 # AUTHOR: Heikki Lehvaslaiho <heikki@bioperl.org>
  14 # OWNER: Heikki Lehvaslaiho <heikki@bioperl.org>
  15 # LICENSE: Perl_5
  16
  17 =head1 SYNOPSIS
  18
  19   # This is a read-only class for all known codon tables.  The IDs are
  20   # the ones used by nucleotide sequence databases.  All common IUPAC
  21   # ambiguity codes for DNA, RNA and amino acids are recognized.
  22
  23   use Bio::Tools::CodonTable;
  24
  25   # defaults to ID 1 "Standard"
  26   $myCodonTable   = Bio::Tools::CodonTable->new();
  27   $myCodonTable2  = Bio::Tools::CodonTable->new( -id => 3 );
  28
  29   # change codon table
  30   $myCodonTable->id(5);
  31
  32   # examine codon table
  33   print  join (' ', "The name of the codon table no.", $myCodonTable->id(4),
  34            "is:", $myCodonTable->name(), "\n");
  35
  36   # print possible codon tables
  37   $tables = Bio::Tools::CodonTable->tables;
  38   while ( ($id,$name) = each %{$tables} ) {
  39     print "$id = $name\n";
  40   }
  41
  42   # translate a codon
  43   $aa = $myCodonTable->translate('ACU');
  44   $aa = $myCodonTable->translate('act');
  45   $aa = $myCodonTable->translate('ytr');
  46
  47   # reverse translate an amino acid
  48   @codons = $myCodonTable->revtranslate('A');
  49   @codons = $myCodonTable->revtranslate('Ser');
  50   @codons = $myCodonTable->revtranslate('Glx');
  51   @codons = $myCodonTable->revtranslate('cYS', 'rna');
  52
  53   # reverse translate an entire amino acid sequence into a IUPAC
  54   # nucleotide string
  55
  56   my $seqobj    = Bio::PrimarySeq->new(-seq => 'FHGERHEL');
  57   my $iupac_str = $myCodonTable->reverse_translate_all($seqobj);
  58
  59   # boolean tests
  60   print "Is a start\n"       if $myCodonTable->is_start_codon('ATG');
  61   print "Is a terminator\n"  if $myCodonTable->is_ter_codon('tar');
  62   print "Is a unknown\n"     if $myCodonTable->is_unknown_codon('JTG');
  63
  64 =head1 DESCRIPTION
  65
  66 Codon tables are also called translation tables or genetic codes
  67 since that is what they represent. A bit more complete picture
  68 of the full complexity of codon usage in various taxonomic groups
  69 is presented at the NCBI Genetic Codes Home page.
  70
  71 CodonTable is a BioPerl class that knows all current translation
  72 tables that are used by primary nucleotide sequence databases
  73 (GenBank, EMBL and DDBJ). It provides methods to output information
  74 about tables and relationships between codons and amino acids.
  75
  76 This class and its methods recognize all common IUPAC ambiguity codes
  77 for DNA, RNA and amino acids. The translation method follows the
  78 conventions in EMBL and TREMBL databases.
  79
  80 It is a nuisance to separate RNA and cDNA representations of nucleic
  81 acid transcripts. The CodonTable object accepts codons of both types as
  82 input and allows the user to set the mode for output when reverse
  83 translating. Its default for output is DNA.
  84
  85 Note:
  86
  87 This class deals primarily with individual codons and amino
  88 acids. However, in the interest of speed you can L</translate>
  89 longer sequences, too. The full complexity of protein translation
  90 is tackled by L<Bio::PrimarySeqI/translate>.
  91
  92
  93 The amino acid codes are IUPAC recommendations for common amino acids:
  94
  95           A           Ala            Alanine
  96           R           Arg            Arginine
  97           N           Asn            Asparagine
  98           D           Asp            Aspartic acid
  99           C           Cys            Cysteine
 100           Q           Gln            Glutamine
 101           E           Glu            Glutamic acid
 102           G           Gly            Glycine
 103           H           His            Histidine
 104           I           Ile            Isoleucine
 105           L           Leu            Leucine
 106           K           Lys            Lysine
 107           M           Met            Methionine
 108           F           Phe            Phenylalanine
 109           P           Pro            Proline
 110           O           Pyl            Pyrrolysine (22nd amino acid)
 111           U           Sec            Selenocysteine (21st amino acid)
 112           S           Ser            Serine
 113           T           Thr            Threonine
 114           W           Trp            Tryptophan
 115           Y           Tyr            Tyrosine
 116           V           Val            Valine
 117           B           Asx            Aspartic acid or Asparagine
 118           Z           Glx            Glutamine or Glutamic acid
 119           J           Xle            Leucine or Isoleucine (mass spec ambiguity)
 120           X           Xaa            Any or unknown amino acid
 121
 122
 123 It is worth noting that the "Bacterial" codon table no. 11 produces a
 124 polypeptide that is, confusingly, identical to the standard one. The
 125 only differences are in available initiator codons.
 126
 127
 128 NCBI Genetic Codes home page:
 129      (Last update of the Genetic Codes: Apr. 25, 2024)
 130      https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
 131
 132 The "value notation" / "print form" ASN.1 version is at:
 133      ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
 134
 135 Thanks to Matteo diTomasso for the original Perl implementation
 136 of these tables.
 137
 138 =cut
 139
 140
 141 # set internal values for all translation tables
 142 use constant CODONSIZE => 3;
 143 our $GAP = '-';
 144 our $CODONGAP = $GAP x CODONSIZE;
 145 our %IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub();
 146 our %IUPAC_AA = Bio::Tools::IUPAC->iupac_iup();
 147 our %THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2);
 148 our $VALID_PROTEIN = '['.join('',Bio::SeqUtils->valid_aa(0)).']';
 149 our $TERMINATOR = '*';
 150
 151 our (@NAMES, @TABLES, @STARTS);
 152 # Parse the ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt file which
 153 # is below __DATA__ in this module (see the end of the file).  This
 154 # fills the @NAMES, @TABLES, and @STARTS variables.  To update to a
 155 # new release of gc.prt, replace the content below __DATA__.
 156 {
 157     # Init tables has with special option (id=0) for ATG-only start
 158     my %tables = (
 159         0 => {
 160             name => "Strict",
 161             ncbieaa => "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 162             sncbieaa => "----------**--*--------------------M----------------------------",
 163         },
 164     );
 165
 166     while (defined(my $line = <DATA>)) {
 167         next if $line =~ /^\s*--/;  # skip comment lines
 168         if ($line =~ /^\s*\{\s*$/) {  # start of a table description
 169             my $name = "";
 170             my $id = 0;
 171             my $ncbieaa = "";
 172             my $sncbieaa = "";
 173             do {
 174                 if ($line =~ /^\s*(name|id|ncbieaa|sncbieaa)\s+(.+)/) {
 175                     my $key = $1;
 176                     my $rem = $2;
 177                     if ($key eq "id") {
 178                         $rem =~ /^(\d+)/;
 179                         $id = int $1;
 180                     } else {
 181                         # The remaining keys --- name, ncbieaa, and
 182                         # sncbieaa --- are strings which may be
 183                         # multi-line (e.g., name for table with id 4).
 184                         # We are assuming that there is no " character
 185                         # inside the value so we keep appending lines
 186                         # until we find an end ".
 187                         while ($rem !~ /^"(.*)"/ && ! eof DATA) {
 188                             $rem .= <DATA>;
 189                         }
 190                         $rem =~ s/\n//g;
 191                         $rem =~ /^"(.*)"/;
 192                         my $str = $1;
 193                         if ($key eq "name" && ! $name) {
 194                             # ignore alternative names, e.g. SGC0,
 195                             # only keep the first name listed.
 196                             $name = $str;
 197                         } elsif ($key eq "ncbieaa") {
 198                             $ncbieaa = $str;
 199                         } elsif ($key eq "sncbieaa") {
 200                             $sncbieaa = $str;
 201                         }
 202                     }
 203                 }
 204             } until (($line = <DATA>) =~ /^\s*}\s*,?$/);  # we reached the end of table description
 205             $tables{$id} = {
 206                 name => $name,
 207                 ncbieaa => $ncbieaa,
 208                 sncbieaa => $sncbieaa
 209             };
 210         }
 211     }
 212     close DATA;
 213     # use Data::Dumper;
 214     # print Dumper %tables;
 215
 216     # After parsing gc.prt, fill in @NAMES, @TABLES, and @STARTS
 217     my $highest_id = (sort {$a <=> $b} keys %tables)[-1];
 218     for (my $i = 0; $i < $highest_id; $i++) {
 219         if (defined $tables{$i}) {
 220             push @NAMES, $tables{$i}->{name};
 221             push @TABLES, $tables{$i}->{ncbieaa};
 222             push @STARTS, $tables{$i}->{sncbieaa};
 223         } else {
 224             push @NAMES, '';
 225             push @TABLES, '';
 226             push @STARTS, '';
 227         }
 228     }
 229 }
 230
 231 our ($TRCOL, $CODONS);
 232 {
 233     my @nucs = qw(t c a g);
 234     my $x = 0;
 235     ($CODONS, $TRCOL) = ({}, {});
 236     for my $i (@nucs) {
 237         for my $j (@nucs) {
 238             for my $k (@nucs) {
 239                 my $codon = "$i$j$k";
 240                 $CODONS->{$codon} = $x;
 241                 $TRCOL->{$x} = $codon;
 242                 $x++;
 243             }
 244         }
 245     }
 246 }
 247
 248 sub new {
 249     my($class,@args) = @_;
 250     my $self = $class->SUPER::new(@args);
 251
 252     my($id) =
 253         $self->_rearrange([qw(ID
 254                  )],
 255              @args);
 256
 257     $id = 1 if ( ! defined ( $id ) );
 258     $id  && $self->id($id);
 259     return $self; # success - we hope!
 260 }
 261
 262 =head2 id
 263
 264  Title   : id
 265  Usage   : $obj->id(3); $id_integer = $obj->id();
 266  Function: Sets or returns the id of the translation table.  IDs are
 267            integers from 0 (special ATG-only start) to 25, excluding
 268            7-8 and 17-20 which have been removed. If an invalid ID is
 269            given the method returns 1, the standard table.
 270  Example :
 271  Returns : value of id, a scalar, warn and fall back to 1 (standard table)
 272            if specified id is not valid
 273  Args    : newvalue (optional)
 274
 275 =cut
 276
 277 sub id{
 278     my ($self,$value) = @_;
 279     if( defined $value) {
 280         if (  not defined $TABLES[$value] or $TABLES[$value] eq '') {
 281             $self->warn("Not a valid codon table ID [$value], using [1] instead ");
 282             $value = 1;
 283         }
 284         $self->{'id'} = $value;
 285     }
 286     return $self->{'id'};
 287 }
 288
 289 =head2 name
 290
 291  Title   : name
 292  Usage   : $obj->name()
 293  Function: returns the descriptive name of the translation table
 294  Example :
 295  Returns : A string
 296  Args    : None
 297
 298
 299 =cut
 300
 301 sub name{
 302    my ($self) = @_;
 303
 304    my ($id) = $self->{'id'};
 305    return $NAMES[$id];
 306 }
 307
 308 =head2 tables
 309
 310  Title   : tables
 311  Usage   : $obj->tables()  or  Bio::Tools::CodonTable->tables()
 312  Function: returns a hash reference where each key is a valid codon
 313            table id() number, and each value is the corresponding
 314            codon table name() string
 315  Example :
 316  Returns : A hashref
 317  Args    : None
 318
 319
 320 =cut
 321
 322 sub tables{
 323   my %tables;
 324   for my $id (0 .. $#NAMES) {
 325     my $name = $NAMES[$id];
 326     $tables{$id} = $name if $name;
 327   }
 328   return \%tables;
 329 }
 330
 331 =head2 translate
 332
 333  Title   : translate
 334  Usage   : $obj->translate('YTR')
 335  Function: Returns a string of one letter amino acid codes from
 336            nucleotide sequence input. The input can be of any length.
 337
 338            Returns 'X' for unknown codons and codons that code for
 339            more than one amino acid. Returns an empty string if input
 340            is not three characters long. Exceptions for these are:
 341
 342              - IUPAC amino acid code B for Aspartic Acid and
 343                Asparagine, is used.
 344              - IUPAC amino acid code Z for Glutamic Acid, Glutamine is
 345                used.
 346              - if the codon is two nucleotides long and if by adding
 347                a third character 'N', it codes for a single amino
 348                acid (with exceptions above), return that, otherwise
 349                return empty string.
 350
 351            Returns empty string for other input strings that are not
 352            three characters long.
 353
 354  Example :
 355  Returns : a string of one letter ambiguous IUPAC amino acid codes
 356  Args    : ambiguous IUPAC nucleotide string
 357
 358
 359 =cut
 360
 361 sub translate {
 362     my ($self, $seq, $complete_codon) = @_;
 363     $self->throw("Calling translate without a seq argument!") unless defined $seq;
 364     return '' unless $seq;
 365
 366     my $id = $self->id;
 367     my ($partial) = 0;
 368     $partial = 2 if length($seq) % CODONSIZE == 2;
 369
 370     $seq = lc $seq;
 371     $seq =~ tr/u/t/;
 372     my $protein = "";
 373     if ($seq =~ /[^actg]/ ) { #ambiguous chars
 374         for (my $i = 0; $i < (length($seq) - (CODONSIZE-1)); $i+= CODONSIZE) {
 375             my $triplet = substr($seq, $i, CODONSIZE);
 376             if( $triplet eq $CODONGAP ) {
 377                 $protein .= $GAP;
 378             } elsif (exists $CODONS->{$triplet}) {
 379                 $protein .= substr($TABLES[$id],
 380                            $CODONS->{$triplet},1);
 381             } else {
 382                 $protein .= $self->_translate_ambiguous_codon($triplet);
 383             }
 384         }
 385     } else { # simple, strict translation
 386         for (my $i = 0; $i < (length($seq) - (CODONSIZE -1)); $i+=CODONSIZE) {
 387             my $triplet = substr($seq, $i, CODONSIZE);
 388             if( $triplet eq $CODONGAP ) {
 389                 $protein .= $GAP;
 390             }
 391             if (exists $CODONS->{$triplet}) {
 392                 $protein .= substr($TABLES[$id], $CODONS->{$triplet}, 1);
 393             } else {
 394                 $protein .= 'X';
 395             }
 396         }
 397     }
 398     if ($partial == 2 && $complete_codon) { # 2 overhanging nucleotides
 399         my $triplet = substr($seq, ($partial -4)). "n";
 400         if( $triplet eq $CODONGAP ) {
 401             $protein .= $GAP;
 402         } elsif (exists $CODONS->{$triplet}) {
 403             my $aa = substr($TABLES[$id], $CODONS->{$triplet},1);
 404             $protein .= $aa;
 405         } else {
 406             $protein .= $self->_translate_ambiguous_codon($triplet, $partial);
 407         }
 408     }
 409     return $protein;
 410 }
 411
 412 sub _translate_ambiguous_codon {
 413     my ($self, $triplet, $partial) = @_;
 414     $partial ||= 0;
 415     my $id = $self->id;
 416     my $aa;
 417     my @codons = $self->unambiguous_codons($triplet);
 418     my %aas =();
 419     foreach my $codon (@codons) {
 420         $aas{substr($TABLES[$id],$CODONS->{$codon},1)} = 1;
 421     }
 422     my $count = scalar keys %aas;
 423     if ( $count == 1 ) {
 424         $aa = (keys %aas)[0];
 425     }
 426     elsif ( $count == 2 ) {
 427         if ($aas{'D'} and $aas{'N'}) {
 428             $aa = 'B';
 429         }
 430         elsif ($aas{'E'} and $aas{'Q'}) {
 431             $aa = 'Z';
 432         } else {
 433             $partial ? ($aa = '') : ($aa = 'X');
 434         }
 435     } else {
 436         $partial ? ($aa = '') :  ($aa = 'X');
 437     }
 438     return $aa;
 439 }
 440
 441 =head2 translate_strict
 442
 443  Title   : translate_strict
 444  Usage   : $obj->translate_strict('ACT')
 445  Function: returns one letter amino acid code for a codon input
 446
 447            Fast and simple translation. User is responsible to resolve
 448            ambiguous nucleotide codes before calling this
 449            method. Returns 'X' for unknown codons and an empty string
 450            for input strings that are not three characters long.
 451
 452            It is not recommended to use this method in a production
 453            environment. Use method translate, instead.
 454
 455  Example :
 456  Returns : A string
 457  Args    : a codon = a three nucleotide character string
 458
 459
 460 =cut
 461
 462 sub translate_strict{
 463    my ($self, $value) = @_;
 464    my $id = $self->{'id'};
 465
 466    $value  = lc $value;
 467    $value  =~ tr/u/t/;
 468
 469    return '' unless length $value == 3;
 470
 471    return 'X' unless defined $CODONS->{$value};
 472
 473    return substr( $TABLES[$id], $CODONS->{$value}, 1 );
 474 }
 475
 476 =head2 revtranslate
 477
 478  Title   : revtranslate
 479  Usage   : $obj->revtranslate('G')
 480  Function: returns codons for an amino acid
 481
 482            Returns an empty string for unknown amino acid
 483            codes. Ambiguous IUPAC codes Asx,B, (Asp,D; Asn,N) and
 484            Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three
 485            letter amino acid codes are accepted. '*' and 'Ter' are
 486            used for terminator.
 487
 488            By default, the output codons are shown in DNA.  If the
 489            output is needed in RNA (tr/t/u/), add a second argument
 490            'RNA'.
 491
 492  Example : $obj->revtranslate('Gly', 'RNA')
 493  Returns : An array of three lower case letter strings i.e. codons
 494  Args    : amino acid, 'RNA'
 495
 496 =cut
 497
 498 sub revtranslate {
 499     my ($self, $value, $coding) = @_;
 500     my @codons;
 501
 502     if (length($value) == 3 ) {
 503         $value = lc $value;
 504         $value = ucfirst $value;
 505         $value = $THREELETTERSYMBOLS{$value};
 506     }
 507     if (    defined $value and $value =~ /$VALID_PROTEIN/
 508         and length($value) == 1
 509         ) {
 510         my $id = $self->{'id'};
 511
 512         $value = uc $value;
 513         my @aas = @{$IUPAC_AA{$value}};
 514         foreach my $aa (@aas) {
 515             #print $aa, " -2\n";
 516             $aa = '\*' if $aa eq '*';
 517             while ($TABLES[$id] =~ m/$aa/g) {
 518                 my $p = pos $TABLES[$id];
 519                 push (@codons, $TRCOL->{--$p});
 520             }
 521         }
 522     }
 523
 524     if ($coding and uc ($coding) eq 'RNA') {
 525         for my $i (0..$#codons)  {
 526             $codons[$i] =~ tr/t/u/;
 527         }
 528     }
 529
 530    return @codons;
 531 }
 532
 533 =head2 reverse_translate_all
 534
 535  Title   : reverse_translate_all
 536  Usage   : my $iup_str = $cttable->reverse_translate_all($seq_object)
 537            my $iup_str = $cttable->reverse_translate_all($seq_object,
 538                                                          $cutable,
 539                                                          15);
 540  Function: reverse translates a protein sequence into IUPAC nucleotide
 541            sequence. An 'X' in the protein sequence is converted to 'NNN'
 542            in the nucleotide sequence.
 543  Returns : a string
 544  Args    : a Bio::PrimarySeqI compatible object (mandatory)
 545            a Bio::CodonUsage::Table object and a threshold if only
 546              codons with a relative frequency above the threshold are
 547              to be considered.
 548 =cut
 549
 550 sub reverse_translate_all {
 551     my ($self, $obj, $cut, $threshold) = @_;
 552
 553     ## check args are OK
 554
 555     if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
 556         $self->throw(" I need a Bio::PrimarySeqI object, not a [".
 557                         ref($obj) . "]");
 558         }
 559     if($obj->alphabet ne 'protein') {
 560         $self->throw("Cannot reverse translate, need an amino acid sequence .".
 561                      "This sequence is of type [" . $obj->alphabet ."]");
 562         }
 563     my @data;
 564     my @seq = split '', $obj->seq;
 565
 566     ## if we're not supplying a codon usage table...
 567     if( !$cut && !$threshold) {
 568         ## get lists of possible codons for each aa.
 569         for my $aa (@seq) {
 570             if ($aa =~ /x/i) {
 571                 push @data, (['NNN']);
 572             }else {
 573                 my @cods = $self->revtranslate($aa);
 574                 push @data, \@cods;
 575             }
 576         }
 577     }else{
 578     #else we are supplying a codon usage table, we just want common codons
 579     #check args first.
 580         if(!$cut->isa('Bio::CodonUsage::Table'))    {
 581             $self->throw("I need a Bio::CodonUsage::Table object, not a [".
 582                      ref($cut). "].");
 583             }
 584         my $cod_ref = $cut->probable_codons($threshold);
 585         for my $aa (@seq) {
 586             if ($aa =~ /x/i) {
 587                 push @data, (['NNN']);
 588                 next;
 589                 }
 590             push @data, $cod_ref->{$aa};
 591         }
 592     }
 593
 594     return $self->_make_iupac_string(\@data);
 595 }
 596
 597 =head2 reverse_translate_best
 598
 599  Title   : reverse_translate_best
 600  Usage   : my $str = $cttable->reverse_translate_best($seq_object,$cutable);
 601  Function: Reverse translates a protein sequence into plain nucleotide
 602            sequence (GATC), uses the most common codon for each amino acid
 603  Returns : A string
 604  Args    : A Bio::PrimarySeqI compatible object and a Bio::CodonUsage::Table object
 605
 606 =cut
 607
 608 sub reverse_translate_best {
 609
 610     my ($self, $obj, $cut) = @_;
 611
 612     if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
 613         $self->throw(" I need a Bio::PrimarySeqI object, not a [".
 614                          ref($obj) . "]");
 615     }
 616     if ($obj->alphabet ne 'protein')    {
 617         $self->throw("Cannot reverse translate, need an amino acid sequence .".
 618                          "This sequence is of type [" . $obj->alphabet ."]");
 619     }
 620     if ( !$cut | !$cut->isa('Bio::CodonUsage::Table'))  {
 621         $self->throw("I need a Bio::CodonUsage::Table object, not a [".
 622                          ref($cut). "].");
 623     }
 624
 625     my $str = '';
 626     my @seq = split '', $obj->seq;
 627
 628     my $cod_ref = $cut->most_common_codons();
 629
 630     for my $aa ( @seq ) {
 631         if ($aa =~ /x/i) {
 632             $str .= 'NNN';
 633             next;
 634         }
 635         if ( defined $cod_ref->{$aa} ) {
 636             $str .= $cod_ref->{$aa};
 637         } else {
 638             $self->throw("Input sequence contains invalid character: $aa");
 639         }
 640     }
 641    return $str;
 642 }
 643
 644 =head2 is_start_codon
 645
 646  Title   : is_start_codon
 647  Usage   : $obj->is_start_codon('ATG')
 648  Function: returns true (1) for all codons that can be used as a
 649            translation start, false (0) for others.
 650  Example : $myCodonTable->is_start_codon('ATG')
 651  Returns : boolean
 652  Args    : codon
 653
 654 =cut
 655
 656 sub is_start_codon{
 657    shift->_codon_is( shift, \@STARTS, 'M' );
 658 }
 659
 660 =head2 is_ter_codon
 661
 662  Title   : is_ter_codon
 663  Usage   : $obj->is_ter_codon('GAA')
 664  Function: returns true (1) for all codons that can be used as a
 665            translation terminator, false (0) for others.
 666  Example : $myCodonTable->is_ter_codon('ATG')
 667  Returns : boolean
 668  Args    : codon
 669
 670 =cut
 671
 672 sub is_ter_codon{
 673    my ($self, $value) = @_;
 674    my $id = $self->{'id'};
 675
 676    # We need to ensure U is mapped to T (ie. UAG)
 677    $value = uc $value;
 678    $value =~ tr/U/T/;
 679
 680    if (length $value != 3  )  {
 681        # Incomplete codons are not stop codons
 682        return 0;
 683    } else {
 684        my $result = 0;
 685
 686        # For all the possible codons, if any are not a stop
 687        # codon, fail immediately
 688        for my $c ( $self->unambiguous_codons($value) ) {
 689            my $m = substr( $TABLES[$id], $CODONS->{$c}, 1 );
 690            if($m eq $TERMINATOR) {
 691                $result = 1;
 692            } else {
 693                return 0;
 694            }
 695        }
 696        return $result;
 697    }
 698 }
 699
 700 # desc: compares the passed value with a single entry in the given
 701 #       codon table
 702 # args: a value (typically a three-char string like 'atg'),
 703 #       a reference to the appropriate set of codon tables,
 704 #       a single-character value to check for at the position in the
 705 #       given codon table
 706 # ret:  boolean, true if the given codon table contains the $key at the
 707 #       position corresponding to $value
 708 sub _codon_is {
 709    my ($self, $value, $table, $key ) = @_;
 710
 711    return 0 unless length $value == 3;
 712
 713    $value  = lc $value;
 714    $value  =~ tr/u/t/;
 715
 716    my $id = $self->{'id'};
 717    for my $c ( $self->unambiguous_codons($value) ) {
 718        my $m = substr( $table->[$id], $CODONS->{$c}, 1 );
 719        if ($m eq $key) { return 1; }
 720    }
 721    return 0;
 722 }
 723
 724 =head2 is_unknown_codon
 725
 726  Title   : is_unknown_codon
 727  Usage   : $obj->is_unknown_codon('GAJ')
 728  Function: returns false (0) for all codons that are valid,
 729         true (1) for others.
 730  Example : $myCodonTable->is_unknown_codon('NTG')
 731  Returns : boolean
 732  Args    : codon
 733
 734
 735 =cut
 736
 737 sub is_unknown_codon{
 738    my ($self, $value) = @_;
 739    $value  = lc $value;
 740    $value  =~ tr/u/t/;
 741    return 1 unless $self->unambiguous_codons($value);
 742    return 0;
 743 }
 744
 745 =head2 unambiguous_codons
 746
 747  Title   : unambiguous_codons
 748  Usage   : @codons = $self->unambiguous_codons('ACN')
 749  Returns : array of strings (lower-case unambiguous codons)
 750  Args    : a codon = a three IUPAC nucleotide character string
 751
 752 =cut
 753
 754 sub unambiguous_codons{
 755     my ($self,$value) = @_;
 756     my @nts = map { $IUPAC_DNA{uc $_} }  split(//, $value);
 757
 758     my @codons;
 759     for my $i ( @{$nts[0]} ) {
 760     for my $j ( @{$nts[1]} ) {
 761     for my $k ( @{$nts[2]} ) {
 762         push @codons, lc "$i$j$k";
 763     }}}
 764     return @codons;
 765 }
 766
 767 =head2 _unambiquous_codons
 768
 769 deprecated, now an alias for unambiguous_codons
 770
 771 =cut
 772
 773 sub _unambiquous_codons {
 774     unambiguous_codons( undef, @_ );
 775 }
 776
 777 =head2 add_table
 778
 779  Title   : add_table
 780  Usage   : $newid = $ct->add_table($name, $table, $starts)
 781  Function: Add a custom Codon Table into the object.
 782            Know what you are doing, only the length of
 783            the argument strings is checked!
 784  Returns : the id of the new codon table
 785  Args    : name, a string, optional (can be empty)
 786            table, a string of 64 characters
 787            startcodons, a string of 64 characters, defaults to standard
 788
 789 =cut
 790
 791 sub add_table {
 792     my ($self, $name, $table, $starts) = @_;
 793
 794     $name   ||= 'Custom' . $#NAMES + 1;
 795     $starts ||= $STARTS[1];
 796     $self->throw('Suspect input!')
 797         unless length($table) == 64 and length($starts) == 64;
 798
 799     push @NAMES,  $name;
 800     push @TABLES, $table;
 801     push @STARTS, $starts;
 802
 803     return $#NAMES;
 804 }
 805
 806 sub _make_iupac_string {
 807     my ($self, $cod_ref) = @_;
 808     if(ref($cod_ref) ne 'ARRAY') {
 809         $self->throw(" I need a reference to a list of references to codons, ".
 810                      " not a [". ref($cod_ref) . "].");
 811         }
 812     my %iupac_hash   = Bio::Tools::IUPAC->iupac_rev_iub();
 813     my $iupac_string = ''; ## the string to be returned
 814     for my $aa (@$cod_ref) {
 815
 816         ## scan through codon positions, record the differing values,
 817         # then look up in the iub hash
 818         for my $index(0..2) {
 819             my %h;
 820             map { my $k = substr($_,$index,1);
 821                 $h{$k}  = undef;} @$aa;
 822             my $lookup_key = join '', sort{$a cmp $b}keys %h;
 823
 824             ## extend string
 825             $iupac_string .= $iupac_hash{uc$lookup_key};
 826         }
 827     }
 828     return $iupac_string;
 829 }
 830
 831
 832 1;
 833
 834 # Follows the content of
 835 # ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt, which is the NCBI
 836 # genetic codon table in ASN.1 value notation / print format.  We do
 837 # not have an ASN.1 decoder for value notation but it's easy enough to
 838 # parse.
 839
 840 __DATA__
 841 --**************************************************************************
 842 --  This is the NCBI genetic code table
 843 --  Initial base data set from Andrzej Elzanowski while at PIR International
 844 --  Addition of Eubacterial and Alternative Yeast by J.Ostell at NCBI
 845 --  Base 1-3 of each codon have been added as comments to facilitate
 846 --    readability at the suggestion of Peter Rice, EMBL
 847 --  Later additions by Taxonomy Group staff at NCBI
 848 --
 849 --  Version 4.6
 850 --     Renamed genetic code 24 to Rhabdopleuridae Mitochondrial
 851 --
 852 --  Version 4.5
 853 --     Added Cephalodiscidae mitochondrial genetic code 33
 854 --
 855 --  Version 4.4
 856 --     Added GTG as start codon for genetic code 3
 857 --     Added Balanophoraceae plastid genetic code 32
 858 --
 859 --  Version 4.3
 860 --     Change to CTG -> Leu in genetic codes 27, 28, 29, 30
 861 --
 862 --  Version 4.2
 863 --     Added Karyorelict nuclear genetic code 27
 864 --     Added Condylostoma nuclear genetic code 28
 865 --     Added Mesodinium nuclear genetic code 29
 866 --     Added Peritrich nuclear genetic code 30
 867 --     Added Blastocrithidia nuclear genetic code 31
 868 --
 869 --  Version 4.1
 870 --     Added Pachysolen tannophilus nuclear genetic code 26
 871 --
 872 --  Version 4.0
 873 --     Updated version to reflect numerous undocumented changes:
 874 --     Corrected start codons for genetic code 25
 875 --     Name of new genetic code is Candidate Division SR1 and Gracilibacteria
 876 --     Added candidate division SR1 nuclear genetic code 25
 877 --     Added GTG as start codon for genetic code 24
 878 --     Corrected Pterobranchia Mitochondrial genetic code (24)
 879 --     Added genetic code 24, Pterobranchia Mitochondrial
 880 --     Genetic code 11 is now Bacterial, Archaeal and Plant Plastid
 881 --     Fixed capitalization of mitochondrial in codes 22 and 23
 882 --     Added GTG, ATA, and TTG as alternative start codons to code 13
 883 --
 884 --  Version 3.9
 885 --     Code 14 differs from code 9 only by translating UAA to Tyr rather than
 886 --     STOP.  A recent study (Telford et al, 2000) has found no evidence that
 887 --     the codon UAA codes for Tyr in the flatworms, but other opinions exist.
 888 --     There are very few GenBank records that are translated with code 14,
 889 --     but a test translation shows that retranslating these records with code
 890 --     9 can cause premature terminations.  Therefore, GenBank will maintain
 891 --     code 14 until further information becomes available.
 892 --
 893 --  Version 3.8
 894 --     Added GTG start to Echinoderm mitochondrial code, code 9
 895 --
 896 --  Version 3.7
 897 --     Added code 23 Thraustochytrium mitochondrial code
 898 --        formerly OGMP code 93
 899 --        submitted by Gertraude Berger, Ph.D.
 900 --
 901 --  Version 3.6
 902 --     Added code 22 TAG-Leu, TCA-stop
 903 --        found in mitochondrial DNA of Scenedesmus obliquus
 904 --        submitted by Gertraude Berger, Ph.D.
 905 --        Organelle Genome Megasequencing Program, Univ Montreal
 906 --
 907 --  Version 3.5
 908 --     Added code 21, Trematode Mitochondrial
 909 --       (as deduced from: Garey & Wolstenholme,1989; Ohama et al, 1990)
 910 --     Added code 16, Chlorophycean Mitochondrial
 911 --       (TAG can be translated to Leucine instead of STOP in chlorophyceans
 912 --        and fungi)
 913 --
 914 --  Version 3.4
 915 --     Added CTG,TTG as allowed alternate start codons in Standard code.
 916 --        Prats et al. 1989, Hann et al. 1992
 917 --
 918 --  Version 3.3 - 10/13/95
 919 --     Added alternate initiation codon ATC to code 5
 920 --        based on complete mitochondrial genome of honeybee
 921 --        Crozier and Crozier (1993)
 922 --
 923 --  Version 3.2 - 6/24/95
 924 --  Code       Comments
 925 --   10        Alternative Ciliate Macronuclear renamed to Euplotid Macro...
 926 --   15        Blepharisma Macro.. code added
 927 --    5        Invertebrate Mito.. GTG allowed as alternate initiator
 928 --   11        Eubacterial renamed to Bacterial as most alternate starts
 929 --               have been found in Archaea
 930 --
 931 --
 932 --  Version 3.1 - 1995
 933 --  Updated as per Andrzej Elzanowski at NCBI
 934 --     Complete documentation in NCBI toolkit documentation
 935 --  Note: 2 genetic codes have been deleted
 936 --
 937 --   Old id   Use id     - Notes
 938 --
 939 --   id 7      id 4      - Kinetoplast code now merged in code id 4
 940 --   id 8      id 1      - all plant chloroplast differences due to RNA edit
 941 --
 942 --
 943 --*************************************************************************
 944
 945 Genetic-code-table ::= {
 946  {
 947   name "Standard" ,
 948   name "SGC0" ,
 949   id 1 ,
 950   ncbieaa  "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 951   sncbieaa "---M------**--*----M---------------M----------------------------"
 952   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 953   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 954   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 955  },
 956  {
 957   name "Vertebrate Mitochondrial" ,
 958   name "SGC1" ,
 959   id 2 ,
 960   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
 961   sncbieaa "----------**--------------------MMMM----------**---M------------"
 962   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 963   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 964   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 965  },
 966  {
 967   name "Yeast Mitochondrial" ,
 968   name "SGC2" ,
 969   id 3 ,
 970   ncbieaa  "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 971   sncbieaa "----------**----------------------MM---------------M------------"
 972   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 973   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 974   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 975  },
 976  {
 977     name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate
 978  Mitochondrial; Mycoplasma; Spiroplasma" ,
 979   name "SGC3" ,
 980   id 4 ,
 981   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 982   sncbieaa "--MM------**-------M------------MMMM---------------M------------"
 983   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 984   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 985   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 986  },
 987  {
 988   name "Invertebrate Mitochondrial" ,
 989   name "SGC4" ,
 990   id 5 ,
 991   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
 992   sncbieaa "---M------**--------------------MMMM---------------M------------"
 993   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 994   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 995   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 996  },
 997  {
 998   name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" ,
 999   name "SGC5" ,
1000   id 6 ,
1001   ncbieaa  "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1002   sncbieaa "--------------*--------------------M----------------------------"
1003   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1004   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1005   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1006  },
1007  {
1008   name "Echinoderm Mitochondrial; Flatworm Mitochondrial" ,
1009   name "SGC8" ,
1010   id 9 ,
1011   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
1012   sncbieaa "----------**-----------------------M---------------M------------"
1013   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1014   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1015   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1016  },
1017  {
1018   name "Euplotid Nuclear" ,
1019   name "SGC9" ,
1020   id 10 ,
1021   ncbieaa  "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1022   sncbieaa "----------**-----------------------M----------------------------"
1023   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1024   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1025   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1026  },
1027  {
1028   name "Bacterial, Archaeal and Plant Plastid" ,
1029   id 11 ,
1030   ncbieaa  "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1031   sncbieaa "---M------**--*----M------------MMMM---------------M------------"
1032   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1033   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1034   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1035  },
1036  {
1037   name "Alternative Yeast Nuclear" ,
1038   id 12 ,
1039   ncbieaa  "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1040   sncbieaa "----------**--*----M---------------M----------------------------"
1041   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1042   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1043   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1044  },
1045  {
1046   name "Ascidian Mitochondrial" ,
1047   id 13 ,
1048   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
1049   sncbieaa "---M------**----------------------MM---------------M------------"
1050   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1051   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1052   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1053  },
1054  {
1055   name "Alternative Flatworm Mitochondrial" ,
1056   id 14 ,
1057   ncbieaa  "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
1058   sncbieaa "-----------*-----------------------M----------------------------"
1059   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1060   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1061   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1062  } ,
1063  {
1064   name "Blepharisma Macronuclear" ,
1065   id 15 ,
1066   ncbieaa  "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1067   sncbieaa "----------*---*--------------------M----------------------------"
1068   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1069   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1070   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1071  } ,
1072  {
1073   name "Chlorophycean Mitochondrial" ,
1074   id 16 ,
1075   ncbieaa  "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1076   sncbieaa "----------*---*--------------------M----------------------------"
1077   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1078   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1079   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1080  } ,
1081  {
1082   name "Trematode Mitochondrial" ,
1083   id 21 ,
1084   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
1085   sncbieaa "----------**-----------------------M---------------M------------"
1086   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1087   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1088   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1089  } ,
1090  {
1091   name "Scenedesmus obliquus Mitochondrial" ,
1092   id 22 ,
1093   ncbieaa  "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1094   sncbieaa "------*---*---*--------------------M----------------------------"
1095   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1096   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1097   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1098  } ,
1099  {
1100   name "Thraustochytrium Mitochondrial" ,
1101   id 23 ,
1102   ncbieaa  "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1103   sncbieaa "--*-------**--*-----------------M--M---------------M------------"
1104   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1105   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1106   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1107  } ,
1108  {
1109   name "Rhabdopleuridae Mitochondrial" ,
1110   id 24 ,
1111   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG",
1112   sncbieaa "---M------**-------M---------------M---------------M------------"
1113   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1114   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1115   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1116  } ,
1117  {
1118   name "Candidate Division SR1 and Gracilibacteria" ,
1119   id 25 ,
1120   ncbieaa  "FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1121   sncbieaa "---M------**-----------------------M---------------M------------"
1122   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1123   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1124   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1125  } ,
1126  {
1127   name "Pachysolen tannophilus Nuclear" ,
1128   id 26 ,
1129   ncbieaa  "FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1130   sncbieaa "----------**--*----M---------------M----------------------------"
1131   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1132   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1133   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1134  } ,
1135  {
1136   name "Karyorelict Nuclear" ,
1137   id 27 ,
1138   ncbieaa  "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1139   sncbieaa "--------------*--------------------M----------------------------"
1140   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1141   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1142   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1143  } ,
1144  {
1145   name "Condylostoma Nuclear" ,
1146   id 28 ,
1147   ncbieaa  "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1148   sncbieaa "----------**--*--------------------M----------------------------"
1149   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1150   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1151   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1152  } ,
1153  {
1154   name "Mesodinium Nuclear" ,
1155   id 29 ,
1156   ncbieaa  "FFLLSSSSYYYYCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1157   sncbieaa "--------------*--------------------M----------------------------"
1158   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1159   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1160   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1161  } ,
1162  {
1163   name "Peritrich Nuclear" ,
1164   id 30 ,
1165   ncbieaa  "FFLLSSSSYYEECC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1166   sncbieaa "--------------*--------------------M----------------------------"
1167   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1168   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1169   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1170  } ,
1171  {
1172   name "Blastocrithidia Nuclear" ,
1173   id 31 ,
1174   ncbieaa  "FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1175   sncbieaa "----------**-----------------------M----------------------------"
1176   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1177   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1178   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1179  } ,
1180  {
1181   name "Balanophoraceae Plastid" ,
1182   id 32 ,
1183   ncbieaa  "FFLLSSSSYY*WCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
1184   sncbieaa "---M------*---*----M------------MMMM---------------M------------"
1185   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1186   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1187   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1188  } ,
1189  {
1190   name "Cephalodiscidae Mitochondrial" ,
1191   id 33 ,
1192   ncbieaa  "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG",
1193   sncbieaa "---M-------*-------M---------------M---------------M------------"
1194   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
1195   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
1196   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
1197  }
1198 }