Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / Tools / SeqStats.pm
blob1ea07d49ffbf23d5b0b43264817d34752ee28ddd
2 # BioPerl module for Bio::Tools::SeqStats
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by
8 # Copyright Peter Schattner
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::SeqStats - Object holding statistics for one
17 particular sequence
19 =head1 SYNOPSIS
21 # build a primary nucleic acid or protein sequence object somehow
22 # then build a statistics object from the sequence object
24 $seqobj = Bio::PrimarySeq->new(-seq => 'ACTGTGGCGTCAACTG',
25 -alphabet => 'dna',
26 -id => 'test');
27 $seq_stats = Bio::Tools::SeqStats->new(-seq => $seqobj);
29 # obtain a hash of counts of each type of monomer
30 # (i.e. amino or nucleic acid)
31 print "\nMonomer counts using statistics object\n";
32 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj);
33 $hash_ref = $seq_stats->count_monomers(); # e.g. for DNA sequence
34 foreach $base (sort keys %$hash_ref) {
35 print "Number of bases of type ", $base, "= ",
36 %$hash_ref->{$base},"\n";
39 # obtain the count directly without creating a new statistics object
40 print "\nMonomer counts without statistics object\n";
41 $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
42 foreach $base (sort keys %$hash_ref) {
43 print "Number of bases of type ", $base, "= ",
44 %$hash_ref->{$base},"\n";
48 # obtain hash of counts of each type of codon in a nucleic acid sequence
49 print "\nCodon counts using statistics object\n";
50 $hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence
51 foreach $base (sort keys %$hash_ref) {
52 print "Number of codons of type ", $base, "= ",
53 %$hash_ref->{$base},"\n";
56 # or
57 print "\nCodon counts without statistics object\n";
58 $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
59 foreach $base (sort keys %$hash_ref) {
60 print "Number of codons of type ", $base, "= ",
61 %$hash_ref->{$base},"\n";
64 # Obtain the molecular weight of a sequence. Since the sequence
65 # may contain ambiguous monomers, the molecular weight is returned
66 # as a (reference to) a two element array containing greatest lower
67 # bound (GLB) and least upper bound (LUB) of the molecular weight
68 $weight = $seq_stats->get_mol_wt();
69 print "\nMolecular weight (using statistics object) of sequence ",
70 $seqobj->id(), " is between ", $$weight[0], " and " ,
71 $$weight[1], "\n";
73 # or
74 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
75 print "\nMolecular weight (without statistics object) of sequence ",
76 $seqobj->id(), " is between ", $$weight[0], " and " ,
77 $$weight[1], "\n";
79 # Calculate mean Kyte-Doolittle hydropathicity (aka "gravy" score)
80 my $prot = Bio::PrimarySeq->new(-seq=>'MSFVLVAPDMLATAAADVVQIGSAVSAGS',
81 -alphabet=>'protein');
82 my $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
83 print "might be hydropathic" if $gravy > 1;
85 =head1 DESCRIPTION
87 Bio::Tools::SeqStats is a lightweight object for the calculation of
88 simple statistical and numerical properties of a sequence. By
89 "lightweight" I mean that only "primary" sequences are handled by the
90 object. The calling script needs to create the appropriate primary
91 sequence to be passed to SeqStats if statistics on a sequence feature
92 are required. Similarly if a codon count is desired for a
93 frame-shifted sequence and/or a negative strand sequence, the calling
94 script needs to create that sequence and pass it to the SeqStats
95 object.
97 Nota that nucleotide sequences in bioperl do not strictly separate RNA
98 and DNA sequences. By convention, sequences from RNA molecules are
99 shown as is they were DNA. Objects are supposed to make the
100 distinction when needed. This class is one of the few where this
101 distinctions needs to be made. Internally, it changes all Ts into Us
102 before weight and monomer count.
104 SeqStats can be called in two distinct manners. If only a single
105 computation is required on a given sequence object, the method can be
106 called easily using the SeqStats object directly:
108 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
110 Alternately, if several computations will be required on a given
111 sequence object, an "instance" statistics object can be constructed
112 and used for the method calls:
114 $seq_stats = Bio::Tools::SeqStats->new($seqobj);
115 $monomers = $seq_stats->count_monomers();
116 $codons = $seq_stats->count_codons();
117 $weight = $seq_stats->get_mol_wt();
118 $gravy = $seq_stats->hydropathicity();
120 As currently implemented the object can return the following values
121 from a sequence:
123 =over
125 =item *
127 The molecular weight of the sequence: get_mol_wt()
129 =item *
131 The number of each type of monomer present: count_monomers()
133 =item *
135 The number of each codon present in a nucleic acid sequence:
136 count_codons()
138 =item *
140 The mean hydropathicity ("gravy" score) of a protein:
141 hydropathicity()
143 =back
145 For DNA and RNA sequences single-stranded weights are returned. The
146 molecular weights are calculated for neutral, or not ionized,
147 nucleic acids. The returned weight is the sum of the
148 base-sugar-phosphate residues of the chain plus one weight of water to
149 to account for the additional OH on the phosphate of the 5' residue
150 and the additional H on the sugar ring of the 3' residue. Note that
151 this leads to a difference of 18 in calculated molecular weights
152 compared to some other available programs (e.g. Informax VectorNTI).
154 Note that since sequences may contain ambiguous monomers (e.g. "M",
155 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
156 returns a two-element array containing the greatest lower bound and
157 least upper bound of the molecule. For a sequence with no ambiguous
158 monomers, the two elements of the returned array will be equal. The
159 method count_codons() handles ambiguous bases by simply counting all
160 ambiguous codons together and issuing a warning to that effect.
163 =head1 DEVELOPERS NOTES
165 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
167 Heikki made tiny adjustments (+/- 0.01 daltons) to amino acid
168 molecular weights to have the output match values in SWISS-PROT.
170 Torsten added hydropathicity calculation.
172 =head1 FEEDBACK
174 =head2 Mailing Lists
176 User feedback is an integral part of the evolution of this and other
177 Bioperl modules. Send your comments and suggestions preferably to one
178 of the Bioperl mailing lists. Your participation is much appreciated.
180 bioperl-l@bioperl.org - General discussion
181 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
183 =head2 Support
185 Please direct usage questions or support issues to the mailing list:
187 I<bioperl-l@bioperl.org>
189 rather than to the module maintainer directly. Many experienced and
190 reponsive experts will be able look at the problem and quickly
191 address it. Please include a thorough description of the problem
192 with code and data examples if at all possible.
194 =head2 Reporting Bugs
196 Report bugs to the Bioperl bug tracking system to help us keep track
197 the bugs and their resolution. Bug reports can be submitted the web:
199 https://github.com/bioperl/bioperl-live/issues
201 =head1 AUTHOR - Peter Schattner
203 Email schattner AT alum.mit.edu
205 =head1 CONTRIBUTOR - Torsten Seemann
207 Email torsten.seemann AT infotech.monash.edu.au
209 =head1 APPENDIX
211 The rest of the documentation details each of the object
212 methods. Internal methods are usually preceded with a _
214 =cut
217 package Bio::Tools::SeqStats;
218 use strict;
219 use vars qw(%Alphabets %Alphabets_strict $amino_weights
220 $rna_weights $dna_weights %Weights $amino_hydropathicity);
221 use Bio::Seq;
222 use base qw(Bio::Root::Root);
224 BEGIN {
225 %Alphabets = (
226 'dna' => [ qw(A C G T R Y M K S W H B V D X N) ],
227 'rna' => [ qw(A C G U R Y M K S W H B V D X N) ],
228 'protein' => [ qw(A R N D C Q E G H I L K M F U
229 P S T W X Y V B Z J O *) ], # sac: added B, Z
232 # SAC: new strict alphabet: doesn't allow any ambiguity characters.
233 %Alphabets_strict = (
234 'dna' => [ qw( A C G T ) ],
235 'rna' => [ qw( A C G U ) ],
236 'protein' => [ qw(A R N D C Q E G H I L K M F U
237 P S T W Y V O) ],
241 # IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
242 # Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
244 # Amino Acid alphabet
246 # ------------------------------------------
247 # Symbol Meaning
248 # ------------------------------------------
250 my $amino_A_wt = 89.09;
251 my $amino_C_wt = 121.15;
252 my $amino_D_wt = 133.1;
253 my $amino_E_wt = 147.13;
254 my $amino_F_wt = 165.19;
255 my $amino_G_wt = 75.07;
256 my $amino_H_wt = 155.16;
257 my $amino_I_wt = 131.17;
258 my $amino_K_wt = 146.19;
259 my $amino_L_wt = 131.17;
260 my $amino_M_wt = 149.21;
261 my $amino_N_wt = 132.12;
262 my $amino_O_wt = 255.31;
263 my $amino_P_wt = 115.13;
264 my $amino_Q_wt = 146.15;
265 my $amino_R_wt = 174.20;
266 my $amino_S_wt = 105.09;
267 my $amino_T_wt = 119.12;
268 my $amino_U_wt = 168.06;
269 my $amino_V_wt = 117.15;
270 my $amino_W_wt = 204.23;
271 my $amino_Y_wt = 181.19;
274 $amino_weights = {
275 'A' => [$amino_A_wt, $amino_A_wt], # Alanine
276 'B' => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine
277 'C' => [$amino_C_wt, $amino_C_wt], # Cysteine
278 'D' => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
279 'E' => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
280 'F' => [$amino_F_wt, $amino_F_wt], # Phenylalanine
281 'G' => [$amino_G_wt, $amino_G_wt], # Glycine
282 'H' => [$amino_H_wt, $amino_H_wt], # Histidine
283 'I' => [$amino_I_wt, $amino_I_wt], # Isoleucine
284 'J' => [$amino_L_wt, $amino_I_wt], # Leucine, Isoleucine
285 'K' => [$amino_K_wt, $amino_K_wt], # Lysine
286 'L' => [$amino_L_wt, $amino_L_wt], # Leucine
287 'M' => [$amino_M_wt, $amino_M_wt], # Methionine
288 'N' => [$amino_N_wt, $amino_N_wt], # Asparagine
289 'O' => [$amino_O_wt, $amino_O_wt], # Pyrrolysine
290 'P' => [$amino_P_wt, $amino_P_wt], # Proline
291 'Q' => [$amino_Q_wt, $amino_Q_wt], # Glutamine
292 'R' => [$amino_R_wt, $amino_R_wt], # Arginine
293 'S' => [$amino_S_wt, $amino_S_wt], # Serine
294 'T' => [$amino_T_wt, $amino_T_wt], # Threonine
295 'U' => [$amino_U_wt, $amino_U_wt], # SelenoCysteine
296 'V' => [$amino_V_wt, $amino_V_wt], # Valine
297 'W' => [$amino_W_wt, $amino_W_wt], # Tryptophan
298 'X' => [$amino_G_wt, $amino_W_wt], # Unknown
299 'Y' => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
300 'Z' => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
303 # Extended Dna / Rna alphabet
304 use vars ( qw($C $O $N $H $P $water) );
305 use vars ( qw($adenine $guanine $cytosine $thymine $uracil));
306 use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi));
307 use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt
308 $rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt));
309 use vars ( qw($dna_weights $rna_weights %Weights));
311 $C = 12.01;
312 $O = 16.00;
313 $N = 14.01;
314 $H = 1.01;
315 $P = 30.97;
316 $water = 18.015;
318 $adenine = 5 * $C + 5 * $N + 5 * $H;
319 $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
320 $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
321 $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
322 $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
324 $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P;
325 # neutral (unionized) form
326 $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
328 # the following are single strand molecular weights / base
329 $dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
330 $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
331 $dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
332 $dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
334 $rna_A_wt = $adenine + $ribose_phosphate - $water;
335 $rna_C_wt = $cytosine + $ribose_phosphate - $water;
336 $rna_G_wt = $guanine + $ribose_phosphate - $water;
337 $rna_U_wt = $uracil + $ribose_phosphate - $water;
339 $dna_weights = {
340 'A' => [$dna_A_wt,$dna_A_wt], # Adenine
341 'C' => [$dna_C_wt,$dna_C_wt], # Cytosine
342 'G' => [$dna_G_wt,$dna_G_wt], # Guanine
343 'T' => [$dna_T_wt,$dna_T_wt], # Thymine
344 'M' => [$dna_C_wt,$dna_A_wt], # A or C
345 'R' => [$dna_A_wt,$dna_G_wt], # A or G
346 'W' => [$dna_T_wt,$dna_A_wt], # A or T
347 'S' => [$dna_C_wt,$dna_G_wt], # C or G
348 'Y' => [$dna_C_wt,$dna_T_wt], # C or T
349 'K' => [$dna_T_wt,$dna_G_wt], # G or T
350 'V' => [$dna_C_wt,$dna_G_wt], # A or C or G
351 'H' => [$dna_C_wt,$dna_A_wt], # A or C or T
352 'D' => [$dna_T_wt,$dna_G_wt], # A or G or T
353 'B' => [$dna_C_wt,$dna_G_wt], # C or G or T
354 'X' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
355 'N' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
358 $rna_weights = {
359 'A' => [$rna_A_wt,$rna_A_wt], # Adenine
360 'C' => [$rna_C_wt,$rna_C_wt], # Cytosine
361 'G' => [$rna_G_wt,$rna_G_wt], # Guanine
362 'U' => [$rna_U_wt,$rna_U_wt], # Uracil
363 'M' => [$rna_C_wt,$rna_A_wt], # A or C
364 'R' => [$rna_A_wt,$rna_G_wt], # A or G
365 'W' => [$rna_U_wt,$rna_A_wt], # A or U
366 'S' => [$rna_C_wt,$rna_G_wt], # C or G
367 'Y' => [$rna_C_wt,$rna_U_wt], # C or U
368 'K' => [$rna_U_wt,$rna_G_wt], # G or U
369 'V' => [$rna_C_wt,$rna_G_wt], # A or C or G
370 'H' => [$rna_C_wt,$rna_A_wt], # A or C or U
371 'D' => [$rna_U_wt,$rna_G_wt], # A or G or U
372 'B' => [$rna_C_wt,$rna_G_wt], # C or G or U
373 'X' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
374 'N' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
377 %Weights = (
378 'dna' => $dna_weights,
379 'rna' => $rna_weights,
380 'protein' => $amino_weights,
383 # Amino acid scale: Hydropathicity.
384 # Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132(1982).
385 # http://au.expasy.org/tools/pscale/Hphob.Doolittle.html
387 $amino_hydropathicity = {
388 A => 1.800,
389 R => -4.500,
390 N => -3.500,
391 D => -3.500,
392 C => 2.500,
393 Q => -3.500,
394 E => -3.500,
395 G => -0.400,
396 H => -3.200,
397 I => 4.500,
398 L => 3.800,
399 K => -3.900,
400 M => 1.900,
401 F => 2.800,
402 P => -1.600,
403 S => -0.800,
404 T => -0.700,
405 W => -0.900,
406 Y => -1.300,
407 V => 4.200,
412 sub new {
413 my($class,@args) = @_;
414 my $self = $class->SUPER::new(@args);
416 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
417 unless ($seqobj->isa("Bio::PrimarySeqI")) {
418 $self->throw("SeqStats works only on PrimarySeqI objects");
420 if ( !defined $seqobj->alphabet ||
421 !defined $Alphabets{$seqobj->alphabet}) {
422 $self->throw("Must have a valid alphabet defined for seq (".
423 join(",",keys %Alphabets));
425 $self->{'_seqref'} = $seqobj;
426 # check the letters in the sequence
427 $self->{'_is_strict'} = _is_alphabet_strict($seqobj);
428 return $self;
431 =head2 count_monomers
433 Title : count_monomers
434 Usage : $rcount = $seq_stats->count_monomers();
435 or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj);
436 Function: Counts the number of each type of monomer (amino acid or
437 base) in the sequence.
438 Ts are counted as Us in RNA sequences.
439 Example :
440 Returns : Reference to a hash in which keys are letters of the
441 genetic alphabet used and values are number of occurrences
442 of the letter in the sequence.
443 Args : None or reference to sequence object
444 Throws : Throws an exception if type of sequence is unknown (ie amino
445 or nucleic)or if unknown letter in alphabet. Ambiguous
446 elements are allowed.
448 =cut
450 sub count_monomers{
451 my %count = ();
452 my $seqobj;
453 my $_is_strict;
454 my $element = '';
455 my $_is_instance = 1 ;
456 my $self = shift @_;
457 my $object_argument = shift @_;
459 # First we need to determine if the present object is an instance
460 # object or if the sequence object has been passed as an argument
462 if (defined $object_argument) {
463 $_is_instance = 0;
466 # If we are using an instance object...
467 if ($_is_instance) {
468 if ($self->{'_monomer_count'}) {
469 return $self->{'_monomer_count'}; # return count if previously calculated
471 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
472 $seqobj = $self->{'_seqref'};
473 } else {
474 # otherwise...
475 $seqobj = $object_argument;
477 # Following two lines lead to error in "throw" routine
478 $seqobj->isa("Bio::PrimarySeqI") ||
479 $self->throw("SeqStats works only on PrimarySeqI objects");
480 # is alphabet OK? Is it strict?
481 $_is_strict = _is_alphabet_strict($seqobj);
484 my $alphabet = $_is_strict ? $Alphabets_strict{$seqobj->alphabet} :
485 $Alphabets{$seqobj->alphabet} ; # get array of allowed letters
487 # convert everything to upper case to be safe
488 my $seqstring = uc $seqobj->seq();
490 # Since T is used in RichSeq RNA sequences, do conversion locally
491 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
493 # For each letter, count the number of times it appears in
494 # the sequence
495 LETTER:
496 foreach $element (@$alphabet) {
497 # skip terminator symbol which may confuse regex
498 next LETTER if $element eq '*';
499 $count{$element} = ( $seqstring =~ s/$element/$element/g);
502 if ($_is_instance) {
503 $self->{'_monomer_count'} = \%count; # Save in case called again later
506 return \%count;
509 =head2 get_mol_wt
511 Title : get_mol_wt
512 Usage : $wt = $seqobj->get_mol_wt() or
513 $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
514 Function: Calculate molecular weight of sequence
515 Ts are counted as Us in RNA sequences.
516 Example :
518 Returns : Reference to two element array containing lower and upper
519 bounds of molecule molecular weight. For DNA and RNA
520 sequences single-stranded weights are returned. If
521 sequence contains no ambiguous elements, both entries in
522 array are equal to molecular weight of molecule.
523 Args : None or reference to sequence object
524 Throws : Exception if type of sequence is unknown (ie not amino or
525 nucleic) or if unknown letter in alphabet. Ambiguous
526 elements are allowed.
528 =cut
530 sub get_mol_wt {
531 my $seqobj;
532 my $_is_strict;
533 my $element = '';
534 my $_is_instance = 1 ;
535 my $self = shift @_;
536 my $object_argument = shift @_;
537 my ($weight_array, $rcount);
539 if (defined $object_argument) {
540 $_is_instance = 0;
543 if ($_is_instance) {
544 if ($weight_array = $self->{'_mol_wt'}) {
545 # return mol. weight if previously calculated
546 return $weight_array;
548 $seqobj = $self->{'_seqref'};
549 $rcount = $self->count_monomers();
550 } else {
551 $seqobj = $object_argument;
552 $seqobj->isa("Bio::PrimarySeqI") ||
553 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
554 $_is_strict = _is_alphabet_strict($seqobj); # is alphabet OK?
555 $rcount = $self->count_monomers($seqobj);
558 # We will also need to know what type of monomer we are dealing with
559 my $moltype = $seqobj->alphabet();
561 # In general,the molecular weight is bounded below by the sum of the
562 # weights of lower bounds of each alphabet symbol times the number of
563 # occurrences of the symbol in the sequence. A similar upper bound on
564 # the weight is also calculated.
566 # Note that for "strict" (i.e. unambiguous) sequences there is an
567 # inefficiency since the upper bound = the lower bound and there are
568 # two calculations. However, this decrease in performance will be
569 # minor and leads to significantly more readable code.
571 my $weight_lower_bound = 0;
572 my $weight_upper_bound = 0;
573 my $weight_table = $Weights{$moltype};
574 my $total_res;
576 # compute weight of all the residues
577 foreach $element (keys %$rcount) {
578 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
579 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
581 # this tracks only the residues used for counting MW
582 $total_res += $$rcount{$element};
584 if ($moltype =~ /protein/) {
585 # remove H2O during peptide bond formation.
586 $weight_lower_bound -= $water * ($total_res - 1);
587 $weight_upper_bound -= $water * ($total_res - 1);
588 } else {
589 # Correction because phosphate of 5' residue has additional OH and
590 # sugar ring of 3' residue has additional H
591 $weight_lower_bound += $water;
592 $weight_upper_bound += $water;
595 $weight_lower_bound = sprintf("%.1f", $weight_lower_bound);
596 $weight_upper_bound = sprintf("%.1f", $weight_upper_bound);
598 $weight_array = [$weight_lower_bound, $weight_upper_bound];
600 if ($_is_instance) {
601 $self->{'_mol_wt'} = $weight_array; # Save in case called again later
603 return $weight_array;
607 =head2 count_codons
609 Title : count_codons
610 Usage : $rcount = $seqstats->count_codons() or
611 $rcount = Bio::Tools::SeqStats->count_codons($seqobj)
612 Function: Counts the number of each type of codons for a dna or rna
613 sequence, starting at the 1st triple of the input sequence.
614 Example :
615 Returns : Reference to a hash in which keys are codons of the genetic
616 alphabet used and values are number of occurrences of the
617 codons in the sequence. All codons with "ambiguous" bases
618 are counted together.
619 Args : None or sequence object
620 Throws : an exception if type of sequence is unknown or protein.
622 =cut
624 sub count_codons {
625 my $rcount = {};
626 my $codon ;
627 my $seqobj;
628 my $_is_strict;
629 my $element = '';
630 my $_is_instance = 1 ;
631 my $self = shift @_;
632 my $object_argument = shift @_;
634 if (defined $object_argument) {
635 $_is_instance = 0;
638 if ($_is_instance) {
639 if ($rcount = $self->{'_codon_count'}) {
640 return $rcount; # return count if previously calculated
642 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
643 $seqobj = $self->{'_seqref'};
644 } else {
645 $seqobj = $object_argument;
646 $seqobj->isa("Bio::PrimarySeqI") ||
647 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
648 $_is_strict = _is_alphabet_strict($seqobj);
651 # Codon counts only make sense for nucleic acid sequences
652 my $alphabet = $seqobj->alphabet();
654 unless ($alphabet =~ /[dr]na/i) {
655 $seqobj->throw("Codon counts only meaningful for dna or rna, ".
656 "not for $alphabet sequences.");
659 # If sequence contains ambiguous bases, warn that codons
660 # containing them will all be lumped together in the count.
662 if (!$_is_strict ) {
663 $seqobj->warn("Sequence $seqobj contains ambiguous bases.".
664 " All codons with ambiguous bases will be added together in count.")
665 if $self->verbose >= 0 ;
668 my $seq = $seqobj->seq();
670 # Now step through the string by threes and count the codons
672 CODON:
673 while (length($seq) > 2) {
674 $codon = uc substr($seq,0,3);
675 $seq = substr($seq,3);
676 if ($codon =~ /[^ACTGU]/i) {
677 $$rcount{'ambiguous'}++; #lump together ambiguous codons
678 next CODON;
680 if (!defined $$rcount{$codon}) {
681 $$rcount{$codon}= 1 ;
682 next CODON;
684 $$rcount{$codon}++; # default
687 if ($_is_instance) {
688 $self->{'_codon_count'} = $rcount; # Save in case called again later
691 return $rcount;
695 =head2 hydropathicity
697 Title : hydropathicity
698 Usage : $gravy = $seqstats->hydropathicity(); or
699 $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
701 Function: Calculates the mean Kyte-Doolittle hydropathicity for a
702 protein sequence. Also known as the "gravy" score. Refer to
703 Kyte J., Doolittle R.F., J. Mol. Biol. 157:105-132(1982).
704 Example :
705 Returns : float
706 Args : None or reference to sequence object
708 Throws : an exception if type of sequence is not protein.
710 =cut
712 sub hydropathicity {
713 my $seqobj;
714 my $_is_strict;
715 my $element = '';
716 my $_is_instance = 1 ;
717 my $self = shift @_;
718 my $object_argument = shift @_;
720 if (defined $object_argument) {
721 $_is_instance = 0;
724 if ($_is_instance) {
725 if (my $gravy = $self->{'_hydropathicity'}) {
726 return $gravy; # return value if previously calculated
728 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
729 $seqobj = $self->{'_seqref'};
730 } else {
731 $seqobj = $object_argument;
732 $seqobj->isa("Bio::PrimarySeqI") ||
733 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
734 $_is_strict = _is_alphabet_strict($seqobj);
737 # hydropathicity not menaingful for empty sequences
738 unless ($seqobj->length() > 0) {
739 $seqobj->throw("hydropathicity not defined for zero-length sequences");
742 # hydropathicity only make sense for protein sequences
743 my $alphabet = $seqobj->alphabet();
745 unless ($alphabet =~ /protein/i) {
746 $seqobj->throw("hydropathicity only meaningful for protein, ".
747 "not for $alphabet sequences.");
750 # If sequence contains ambiguous bases, warn that codons
751 # containing them will all be lumped together in the count.
753 unless ($_is_strict ) {
754 $seqobj->throw("Sequence $seqobj contains ambiguous amino acids. ".
755 "Hydropathicity can not be caculated.")
758 my $seq = $seqobj->seq();
760 # Now step through the string and add up the hydropathicity values
762 my $gravy = 0;
763 for my $i ( 0 .. length($seq) ) {
764 my $codon = uc(substr($seq,$i,1));
765 $gravy += $amino_hydropathicity->{$codon}||0; # table look-up
767 $gravy /= length($seq);
770 if ($_is_instance) {
771 $self->{'_hydropathicity'} = $gravy; # Save in case called again later
774 return $gravy;
778 =head2 _is_alphabet_strict
780 Title : _is_alphabet_strict
781 Usage :
782 Function: internal function to determine whether there are
783 any ambiguous elements in the current sequence
784 Example :
785 Returns : 1 if strict alphabet is being used,
786 0 if ambiguous elements are present
787 Args :
789 Throws : an exception if type of sequence is unknown (ie amino or
790 nucleic) or if unknown letter in alphabet. Ambiguous
791 monomers are allowed.
793 =cut
795 sub _is_alphabet_strict {
797 my ($seqobj) = @_;
798 my $moltype = $seqobj->alphabet();
800 # convert everything to upper case to be safe
801 my $seqstring = uc $seqobj->seq();
803 # Since T is used in RichSeq RNA sequences, do conversion locally
804 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
806 # First we check if only the 'strict' letters are present in the
807 # sequence string If not, we check whether the remaining letters
808 # are ambiguous monomers or whether there are illegal letters in
809 # the string
811 # $alpha_array is a ref to an array of the 'strictly' allowed letters
812 my $alpha_array = $Alphabets_strict{$moltype} ;
814 # $alphabet contains the allowed letters in string form
815 my $alphabet = join ('', @$alpha_array) ;
816 unless ($seqstring =~ /[^$alphabet]/) {
817 return 1 ;
820 # Next try to match with the alphabet's ambiguous letters
821 $alpha_array = $Alphabets{$moltype} ;
822 $alphabet = join ('', @$alpha_array) ;
824 unless ($seqstring =~ /[^$alphabet]/) {
825 return 0 ;
828 # If we got here there is an illegal letter in the sequence
829 $seqobj->throw("Alphabet not OK for $seqobj");
832 =head2 _print_data
834 Title : _print_data
835 Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
836 Function: Displays dna / rna parameters (used for debugging)
837 Returns : 1
838 Args : None
840 Used for debugging.
842 =cut
844 sub _print_data {
846 print "\n adenine = : $adenine \n";
847 print "\n guanine = : $guanine \n";
848 print "\n cytosine = : $cytosine \n";
849 print "\n thymine = : $thymine \n";
850 print "\n uracil = : $uracil \n";
852 print "\n dna_A_wt = : $dna_A_wt \n";
853 print "\n dna_C_wt = : $dna_C_wt \n";
854 print "\n dna_G_wt = : $dna_G_wt \n";
855 print "\n dna_T_wt = : $dna_T_wt \n";
857 print "\n rna_A_wt = : $rna_A_wt \n";
858 print "\n rna_C_wt = : $rna_C_wt \n";
859 print "\n rna_G_wt = : $rna_G_wt \n";
860 print "\n rna_U_wt = : $rna_U_wt \n";
862 return 1;