2 # BioPerl module for Bio::CodonUsage::Table
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Richard Adams (richard.adams@ed.ac.uk)
8 # Copyright Richard Adams
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::CodonUsage::Table - for access to the Codon usage Database
17 at http://www.kazusa.or.jp/codon.
21 use Bio::CodonUsage::Table;
23 use Bio::CodonUsage::IO;
24 use Bio::Tools::SeqStats;
26 # Get a codon usage table from web database
27 my $cdtable = Bio::DB::CUTG->new(-sp => 'Mus musculus',
31 my $io = Bio::CodonUsage::IO->new(-file => "file");
32 my $cdtable = $io->next_data();
34 # Or create your own from a Bio::PrimarySeq compliant object,
35 # $codonstats is a ref to a hash of codon name /count key-value pairs
36 my $codonstats = Bio::Tools::SeqStats->count_codons($Seq_objct);
38 # '-data' must be specified, '-species' and 'genetic_code' are optional
39 my $CUT = Bio::CodonUsage::Table->new(-data => $codonstats,
40 -species => 'Hsapiens_kinase');
42 print "leu frequency is ", $cdtable->aa_frequency('LEU'), "\n";
43 print "freq of ATG is ", $cdtable->codon_rel_frequency('ttc'), "\n";
44 print "abs freq of ATG is ", $cdtable->codon_abs_frequency('ATG'), "\n";
45 print "number of ATG codons is ", $cdtable->codon_count('ATG'), "\n";
46 print "GC content at position 1 is ", $cdtable->get_coding_gc('1'), "\n";
47 print "total CDSs for Mus musculus is ", $cdtable->cds_count(), "\n";
52 This class provides methods for accessing codon usage table data.
54 All of the methods at present are simple look-ups of the table or are
55 derived from simple calculations from the table. Future methods could
56 include measuring the codon usage of a sequence , for example, or
57 provide methods for examining codon usage in alignments.
61 L<Bio::Tools::CodonTable>,
63 L<Bio::CodonUsage::IO>,
71 User feedback is an integral part of the evolution of this and other
72 Bioperl modules. Send your comments and suggestions preferably to one
73 of the Bioperl mailing lists. Your participation is much appreciated.
75 bioperl-l@bioperl.org - General discussion
76 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
80 Please direct usage questions or support issues to the mailing list:
82 I<bioperl-l@bioperl.org>
84 rather than to the module maintainer directly. Many experienced and
85 reponsive experts will be able look at the problem and quickly
86 address it. Please include a thorough description of the problem
87 with code and data examples if at all possible.
91 Report bugs to the Bioperl bug tracking system to help us keep track
92 the bugs and their resolution. Bug reports can be submitted via the
95 https://github.com/bioperl/bioperl-live/issues
99 Richard Adams, Richard.Adams@ed.ac.uk
103 The rest of the documentation details each of the object
104 methods. Internal methods are usually preceded with a _
109 # Let the code begin...
111 package Bio
::CodonUsage
::Table
;
113 use vars
qw(%STRICTAA @AA);
115 use Bio::Tools::CodonTable;
117 use base qw(Bio::Root::Root);
120 @AA = qw(A C D E F G H I K L M N P Q R S T V W Y *);
121 map {$STRICTAA{$_} = undef} @AA;
127 Usage : my $cut = Bio::CodonUsage::Table->new(-data => $cut_hash_ref,
128 -species => 'H.sapiens_kinase'
130 Returns : a reference to a new Bio::CodonUsage::Table object
131 Args : none or a reference to a hash of codon counts. This constructor is
132 designed to be compatible with the output of
133 Bio::Tools::SeqUtils::count_codons()
134 Species and genetic code parameters can be entered here or via the
135 species() and genetic_code() methods separately.
140 my ($class, @args) = @_;
141 my $self= $class->SUPER::new
(@args);
143 $self->_rearrange([qw(DATA)], @args);
144 shift @args; # get rid of key
145 my $arg = shift @args;
146 $self->throw("need a hash reference, not a [" . ref($arg). "] reference") if ref($arg) ne 'HASH';
147 ### flags to detect argument type, can be either to start with ##
148 my $is_codon_hash = 1;
150 for my $k (keys %$arg) {
153 if (!exists($STRICTAA{$k}) ){
156 elsif ($k =~ /[^ATCGatcg]/) {
160 if (!$is_codon_hash && !$is_Aa_hash) {
161 $self->throw(" invalid key values in CUT hash - must be unique aa or nucleotide identifiers");
163 elsif ($is_Aa_hash) {
164 $self->_init_from_aa($arg);
166 elsif($is_codon_hash) {
167 $self->_init_from_cod($arg);
170 my $key = shift @args;
171 $key =~ s/\-(\w+)/\L$1/;
173 $self->$key(shift @args);
180 =head2 all_aa_frequencies
182 Title : all_aa_frequencies
183 Usage : my $freq = $cdtable->all_aa_frequencies();
184 Returns : a reference to a hash where each key is an amino acid
185 and each value is its frequency in all proteins in that
191 sub all_aa_frequencies
{
194 for my $aa (keys %STRICTAA) {
195 my $freq = $self->aa_frequency($aa);
196 $aa_freqs{$aa} = $freq;
201 =head2 codon_abs_frequency
203 Title : codon_abs_frequency
204 Usage : my $freq = $cdtable->codon_abs_frequency('CTG');
205 Purpose : To return the frequency of that codon as a percentage
206 of all codons in the organism.
207 Returns : a percentage frequency
208 Args : a non-ambiguous codon string
212 sub codon_abs_frequency
{
215 if ($self->_check_codon($cod)) {
216 my $ctable = Bio
::Tools
::CodonTable
->new;
217 $ctable->id($self->genetic_code() );
218 my $aa =$Bio::SeqUtils
::THREECODE
{$ctable->translate($cod)};
220 return $self->{'_table'}{$aa}{$cod}{'per1000'}/10 ;
225 =head2 codon_rel_frequency
227 Title : codon_rel_frequency
228 Usage : my $freq = $cdtable->codon_rel_frequency('CTG');
229 Purpose : To return the frequency of that codon as a percentage
230 of codons coding for the same amino acid. E.g., ATG and TGG
231 would return 100 as those codons are unique.
232 Returns : a percentage frequency
233 Args : a non-ambiguous codon string
238 sub codon_rel_frequency
{
241 if ($self->_check_codon($cod)) {
242 my $ctable = Bio
::Tools
::CodonTable
->new;
243 $ctable->id($self->genetic_code () );
244 my $aa =$Bio::SeqUtils
::THREECODE
{$ctable->translate($cod)};
245 return $self->{'_table'}{$aa}{$cod}{'rel_freq'};
252 =head2 probable_codons
254 Title : probable_codons
255 Usage : my $prob_codons = $cd_table->probable_codons(10);
256 Purpose : to obtain a list of codons for the amino acid above a given
257 threshold % relative frequency
258 Returns : A reference to a hash where keys are 1 letter amino acid codes
259 and values are references to arrays of codons whose frequency
260 is above the threshold.
261 Arguments: a minimum threshold frequency
265 sub probable_codons
{
266 my ($self, $threshold) = @_;
267 if (!$threshold || $threshold < 0 || $threshold > 100) {
268 $self->throw(" I need a threshold percentage ");
271 for my $a(keys %STRICTAA) {
273 my $aa =$Bio::SeqUtils
::THREECODE
{$a};
274 for my $codon (keys %{ $self->{'_table'}{$aa}}) {
275 if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $threshold/100){
276 push @common_codons, $codon;
279 $return_hash{$a} = \
@common_codons;
281 ## check to make sure that all codons are populated ##
282 if (grep{scalar @
{$return_hash{$_}} == 0} keys %return_hash) {
283 $self->warn("Threshold is too high, some amino acids do not" .
284 " have any codon above the threshold!");
286 return \
%return_hash;
290 =head2 most_common_codons
292 Title : most_common_codons
293 Usage : my $common_codons = $cd_table->most_common_codons();
294 Purpose : To obtain the most common codon for a given amino acid
295 Returns : A reference to a hash where keys are 1 letter amino acid codes
296 and the values are the single most common codons for those amino acids
301 sub most_common_codons
{
306 for my $a ( keys %STRICTAA ) {
308 my $common_codon = '';
310 my $aa = $Bio::SeqUtils
::THREECODE
{$a};
312 if ( ! defined $self->{'_table'}{$aa} ) {
313 $self->warn("Amino acid $aa ($a) does not have any codons!");
317 for my $codon ( keys %{ $self->{'_table'}{$aa}} ) {
318 if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $rel_freq ){
319 $common_codon = $codon;
320 $rel_freq = $self->{'_table'}{$aa}{$codon}{'rel_freq'};
323 $return_hash{$a} = $common_codon;
326 return \
%return_hash;
332 Usage : my $count = $cdtable->codon_count('CTG');
333 Purpose : To obtain the absolute number of the codons in the
336 Args : a non-ambiguous codon string
345 if ($self->_check_codon($cod)) {
346 my $ctable = Bio
::Tools
::CodonTable
->new;
347 $ctable->id($self->genetic_code());
348 my $aa =$Bio::SeqUtils
::THREECODE
{$ctable->translate($cod)};
349 return $self->{'_table'}{$aa}{$cod}{'abs_count'};
354 $self->warn(" need to give a codon sequence as a parameter ");
362 Title : get_coding_gc
363 Usage : my $count = $cdtable->get_coding_gc(1);
364 Purpose : To return the percentage GC composition for the organism at
365 codon positions 1,2 or 3, or an average for all coding sequence
367 Returns : a number (%-age GC content) or 0 if these fields are undefined
368 Args : 1,2,3 or 'all'.
375 $self->warn(" no parameters supplied must be a codon position (1,2,3) or 'all'");
380 ##return request if valid ##
381 if ( exists($self->{'_coding_gc'}{$n} ) ) {
382 return sprintf("%.2f", $self->{'_coding_gc'}{$n});
384 ##else return 'all' value if exists
385 elsif (exists($self->{'_coding_gc'}{'all'} )) {
386 $self->warn("coding gc doesn't have value for [$n], returning gc content for all CDSs");
387 return sprintf("%.2f", $self->{'_coding_gc'}{'all'});
391 $self->warn("coding gc values aren't defined, returning 0");
401 Title : set_coding_gc
402 Usage : my $count = $cdtable->set_coding_gc(-1=>55.78);
403 Purpose : To set the percentage GC composition for the organism at
404 codon positions 1,2 or 3, or an average for all coding sequence
407 Args : a hash where the key must be 1,2,3 or 'all' and the value the %age GC
408 at that codon position..
413 my ($self, $key, $value) = @_;
414 my @allowed = qw(1 2 3 all);
416 if (!grep {$key eq $_} @allowed ) {
417 $self->warn ("invalid key! - must be one of [ ". (join " ", @allowed) . "]");
420 $self->{'_coding_gc'}{$key} = $value;
428 Usage : my $sp = $cut->species();
429 Purpose : Get/setter for species name of codon table
430 Returns : Void or species name string
431 Args : None or species name string
438 $self->{'_species'} = shift;
440 return $self->{'_species'} || "unknown";
446 Usage : my $sp = $cut->genetic_code();
447 Purpose : Get/setter for genetic_code name of codon table
448 Returns : Void or genetic_code id, 1 by default
449 Args : None or genetic_code id, 1 by default if invalid argument.
457 if ($val < 0 || $val >16 || $val =~ /[^\d]/
458 || $val ==7 || $val ==8) {
459 $self->warn ("invalid genetic code - must be 1-16 but not 7 or 8,setting to default [1]");
460 $self->{'_genetic_code'} = 1;
463 $self->{'_genetic_code'} = shift;
466 return $self->{'_genetic_code'} || 1;
472 Usage : my $count = $cdtable->cds_count();
473 Purpose : To retrieve the total number of CDSs used to generate the Codon Table
476 Args : none (if retrieving the value) or an integer( if setting ).
485 $self->warn("can't have negative count initializing to 1");
486 $self->{'_cds_count'} = 0.00;
489 $self->{'_cds_count'} = $val;
492 $self->warn("cds_count value is undefined, returning 0")
493 if !exists($self->{'_cds_count'});
495 return $self->{'_cds_count'} || 0.00;
501 Usage : my $freq = $cdtable->aa_frequency('Leu');
502 Purpose : To retrieve the frequency of an amino acid in the organism
503 Returns : a percentage
504 Args : a 1 letter or 3 letter string representing the amino acid
514 ## deal with cases ##
516 $aa =~ s/^(\w)/\U$1/;
517 if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils
::ONECODE
{$aa}) ) {
518 $self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
521 #translate to 3 letter code for Ctable #
522 my $aa3 = $Bio::SeqUtils
::THREECODE
{$aa} || $aa;
524 ## return % of all amino acids in organism ##
526 map {$freq += $self->{'_table'}{$aa3}{$_}{'per1000'} } keys %{$self->{'_table'}{$aa3}};
527 return sprintf("%.2f", $freq/10);
533 Usage : my $freq = $cdtable->common_codon('Leu');
534 Purpose : To retrieve the frequency of the most common codon of that aa
535 Returns : a percentage
536 Args : a 1 letter or 3 letter string representing the amino acid
544 $aa =~ s/^(\w)/\U$1/;
546 if ($self->_check_aa($aa)) {
547 my $aa3 = $Bio::SeqUtils
::THREECODE
{$aa} ;
550 for my $cod (keys %{$self->{'_table'}{$aa3}}) {
551 $max = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} > $max) ?
552 $self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$max;
561 Usage : my $freq = $cdtable->rare_codon('Leu');
562 Purpose : To retrieve the frequency of the least common codon of that aa
563 Returns : a percentage
564 Args : a 1 letter or 3 letter string representing the amino acid
571 $aa =~ s/^(\w)/\U$1/;
572 if ($self->_check_aa($aa)) {
573 my $aa3 = $Bio::SeqUtils
::THREECODE
{$aa};
576 for my $cod (keys %{$self->{'_table'}{$aa3}}) {
577 $min = ($self->{'_table'}{$aa3}{$cod}{'rel_freq'} < $min) ?
578 $self->{'_table'}{$aa3}{$cod}{'rel_freq'}:$min;
587 ## internal sub that checks a codon is correct format
589 my ($self, $aa ) = @_;
590 if (!exists($STRICTAA{$aa}) && !exists($Bio::SeqUtils
::ONECODE
{$aa}) ) {
591 $self->warn("Invalid amino acid! must be a unique 1 letter or 3 letter identifier");
600 my ($self, $cod) = @_;
601 if ($cod =~ /[^ATCG]/ || $cod !~ /\w\w\w/) {
602 $self->warn(" impossible codon - must be 3 letters and just containing ATCG");
609 ## make hash based on aa and then send to _init_from_aa
610 my ($self, $ref) = @_;
611 my $ct = Bio
::Tools
::CodonTable
->new();
613 for my $codon(keys %$ref ) {
614 my $aa = $ct->translate($codon);
615 $aa_hash{$aa}{$codon} = $ref->{$codon};
617 $self->_init_from_aa(\
%aa_hash);
622 my ($self, $ref) = @_;
623 ## abs counts and count codons
624 my $total_codons = 0;
626 map{$threeletter{$Bio::SeqUtils
::THREECODE
{$_}} = $ref->{$_} } keys %$ref;
627 $ref = \
%threeletter;
628 for my $aa (keys %$ref) {
629 for my $cod(keys %{$ref->{$aa}} ) {
630 $self->{'_table'}{$aa}{$cod}{'abs_count'} = $ref->{$aa}{$cod};
631 $total_codons += $ref->{$aa}{$cod};
635 ## now calculate abs codon frequencies
636 for my $aa (keys %$ref) {
637 for my $cod(keys %{$ref->{$aa}} ) {
638 $self->{'_table'}{$aa}{$cod}{'per1000'} =
639 sprintf("%.2f",$ref->{$aa}{$cod} /$total_codons * 1000) ;
642 ## now calculate rel codon_frequencies
643 for my $aa (keys %$ref) {
645 map{$aa_freq += $ref->{$aa}{$_} }
647 for my $cod(keys %{$ref->{$aa}} ) {
648 $self->{'_table'}{$aa}{$cod}{'rel_freq'}=
649 sprintf("%.2f",$ref->{$aa}{$cod}/ $aa_freq );
654 ## now calculate gc fields
656 for my $aa (keys %$ref) {
657 for my $cod(keys %{$ref->{$aa}} ) {
658 for my $index (qw(1 2 3) ) {
659 if (substr ($cod, $index -1, 1) =~ /g|c/oi) {
660 $GC{$index} += $ref->{$aa}{$cod};
666 map{$tot += $GC{$_}} qw(1 2 3);
667 $self->set_coding_gc('all', $tot/(3 *$total_codons) * 100);
668 map{$self->set_coding_gc($_,$GC{$_}/$total_codons * 100)} qw(1 2 3);
676 return $self->{'_gd_db'} || "unknown";