2 # BioPerl module for Bio::Tools::SeqStats
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
8 # Copyright Peter Schattner
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::SeqStats - Object holding statistics for one
21 # build a primary nucleic acid or protein sequence object somehow
22 # then build a statistics object from the sequence object
24 $seqobj = Bio::PrimarySeq->new(-seq => 'ACTGTGGCGTCAACTG',
27 $seq_stats = Bio::Tools::SeqStats->new(-seq => $seqobj);
29 # obtain a hash of counts of each type of monomer
30 # (i.e. amino or nucleic acid)
31 print "\nMonomer counts using statistics object\n";
32 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj);
33 $hash_ref = $seq_stats->count_monomers(); # e.g. for DNA sequence
34 foreach $base (sort keys %$hash_ref) {
35 print "Number of bases of type ", $base, "= ",
36 %$hash_ref->{$base},"\n";
39 # obtain the count directly without creating a new statistics object
40 print "\nMonomer counts without statistics object\n";
41 $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
42 foreach $base (sort keys %$hash_ref) {
43 print "Number of bases of type ", $base, "= ",
44 %$hash_ref->{$base},"\n";
48 # obtain hash of counts of each type of codon in a nucleic acid sequence
49 print "\nCodon counts using statistics object\n";
50 $hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence
51 foreach $base (sort keys %$hash_ref) {
52 print "Number of codons of type ", $base, "= ",
53 %$hash_ref->{$base},"\n";
57 print "\nCodon counts without statistics object\n";
58 $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
59 foreach $base (sort keys %$hash_ref) {
60 print "Number of codons of type ", $base, "= ",
61 %$hash_ref->{$base},"\n";
64 # Obtain the molecular weight of a sequence. Since the sequence
65 # may contain ambiguous monomers, the molecular weight is returned
66 # as a (reference to) a two element array containing greatest lower
67 # bound (GLB) and least upper bound (LUB) of the molecular weight
68 $weight = $seq_stats->get_mol_wt();
69 print "\nMolecular weight (using statistics object) of sequence ",
70 $seqobj->id(), " is between ", $$weight[0], " and " ,
74 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
75 print "\nMolecular weight (without statistics object) of sequence ",
76 $seqobj->id(), " is between ", $$weight[0], " and " ,
79 # Calculate mean Kyte-Doolittle hydropathicity (aka "gravy" score)
80 my $prot = Bio::PrimarySeq->new(-seq=>'MSFVLVAPDMLATAAADVVQIGSAVSAGS',
81 -alphabet=>'protein');
82 my $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
83 print "might be hydropathic" if $gravy > 1;
87 Bio::Tools::SeqStats is a lightweight object for the calculation of
88 simple statistical and numerical properties of a sequence. By
89 "lightweight" I mean that only "primary" sequences are handled by the
90 object. The calling script needs to create the appropriate primary
91 sequence to be passed to SeqStats if statistics on a sequence feature
92 are required. Similarly if a codon count is desired for a
93 frame-shifted sequence and/or a negative strand sequence, the calling
94 script needs to create that sequence and pass it to the SeqStats
97 Nota that nucleotide sequences in bioperl do not strictly separate RNA
98 and DNA sequences. By convention, sequences from RNA molecules are
99 shown as is they were DNA. Objects are supposed to make the
100 distinction when needed. This class is one of the few where this
101 distinctions needs to be made. Internally, it changes all Ts into Us
102 before weight and monomer count.
104 SeqStats can be called in two distinct manners. If only a single
105 computation is required on a given sequence object, the method can be
106 called easily using the SeqStats object directly:
108 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
110 Alternately, if several computations will be required on a given
111 sequence object, an "instance" statistics object can be constructed
112 and used for the method calls:
114 $seq_stats = Bio::Tools::SeqStats->new($seqobj);
115 $monomers = $seq_stats->count_monomers();
116 $codons = $seq_stats->count_codons();
117 $weight = $seq_stats->get_mol_wt();
118 $gravy = $seq_stats->hydropathicity();
120 As currently implemented the object can return the following values
127 The molecular weight of the sequence: get_mol_wt()
131 The number of each type of monomer present: count_monomers()
135 The number of each codon present in a nucleic acid sequence:
140 The mean hydropathicity ("gravy" score) of a protein:
145 For DNA and RNA sequences single-stranded weights are returned. The
146 molecular weights are calculated for neutral, or not ionized,
147 nucleic acids. The returned weight is the sum of the
148 base-sugar-phosphate residues of the chain plus one weight of water to
149 to account for the additional OH on the phosphate of the 5' residue
150 and the additional H on the sugar ring of the 3' residue. Note that
151 this leads to a difference of 18 in calculated molecular weights
152 compared to some other available programs (e.g. Informax VectorNTI).
154 Note that since sequences may contain ambiguous monomers (e.g. "M",
155 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
156 returns a two-element array containing the greatest lower bound and
157 least upper bound of the molecule. For a sequence with no ambiguous
158 monomers, the two elements of the returned array will be equal. The
159 method count_codons() handles ambiguous bases by simply counting all
160 ambiguous codons together and issuing a warning to that effect.
163 =head1 DEVELOPERS NOTES
165 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
167 Heikki made tiny adjustments (+/- 0.01 daltons) to amino acid
168 molecular weights to have the output match values in SWISS-PROT.
170 Torsten added hydropathicity calculation.
176 User feedback is an integral part of the evolution of this and other
177 Bioperl modules. Send your comments and suggestions preferably to one
178 of the Bioperl mailing lists. Your participation is much appreciated.
180 bioperl-l@bioperl.org - General discussion
181 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
185 Please direct usage questions or support issues to the mailing list:
187 I<bioperl-l@bioperl.org>
189 rather than to the module maintainer directly. Many experienced and
190 reponsive experts will be able look at the problem and quickly
191 address it. Please include a thorough description of the problem
192 with code and data examples if at all possible.
194 =head2 Reporting Bugs
196 Report bugs to the Bioperl bug tracking system to help us keep track
197 the bugs and their resolution. Bug reports can be submitted the web:
199 https://github.com/bioperl/bioperl-live/issues
201 =head1 AUTHOR - Peter Schattner
203 Email schattner AT alum.mit.edu
205 =head1 CONTRIBUTOR - Torsten Seemann
207 Email torsten.seemann AT infotech.monash.edu.au
211 The rest of the documentation details each of the object
212 methods. Internal methods are usually preceded with a _
217 package Bio
::Tools
::SeqStats
;
220 use vars
qw(%Alphabets %Alphabets_strict $amino_weights
221 $rna_weights $dna_weights %Weights $amino_hydropathicity);
223 use base qw(Bio::Root::Root);
227 'dna' => [ qw(A C G T R Y M K S W H B V D X N) ],
228 'rna' => [ qw(A C G U R Y M K S W H B V D X N) ],
229 'protein' => [ qw(A R N D C Q E G H I L K M F U
230 P S T W X Y V B Z J O *) ], # sac: added B, Z
233 # SAC: new strict alphabet: doesn't allow any ambiguity characters.
234 %Alphabets_strict = (
235 'dna' => [ qw( A C G T ) ],
236 'rna' => [ qw( A C G U ) ],
237 'protein' => [ qw(A R N D C Q E G H I L K M F U
242 # IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
243 # Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
245 # Amino Acid alphabet
247 # ------------------------------------------
249 # ------------------------------------------
251 my $amino_A_wt = 89.09;
252 my $amino_C_wt = 121.15;
253 my $amino_D_wt = 133.1;
254 my $amino_E_wt = 147.13;
255 my $amino_F_wt = 165.19;
256 my $amino_G_wt = 75.07;
257 my $amino_H_wt = 155.16;
258 my $amino_I_wt = 131.17;
259 my $amino_K_wt = 146.19;
260 my $amino_L_wt = 131.17;
261 my $amino_M_wt = 149.21;
262 my $amino_N_wt = 132.12;
263 my $amino_O_wt = 255.31;
264 my $amino_P_wt = 115.13;
265 my $amino_Q_wt = 146.15;
266 my $amino_R_wt = 174.20;
267 my $amino_S_wt = 105.09;
268 my $amino_T_wt = 119.12;
269 my $amino_U_wt = 168.06;
270 my $amino_V_wt = 117.15;
271 my $amino_W_wt = 204.23;
272 my $amino_Y_wt = 181.19;
276 'A' => [$amino_A_wt, $amino_A_wt], # Alanine
277 'B' => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine
278 'C' => [$amino_C_wt, $amino_C_wt], # Cysteine
279 'D' => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
280 'E' => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
281 'F' => [$amino_F_wt, $amino_F_wt], # Phenylalanine
282 'G' => [$amino_G_wt, $amino_G_wt], # Glycine
283 'H' => [$amino_H_wt, $amino_H_wt], # Histidine
284 'I' => [$amino_I_wt, $amino_I_wt], # Isoleucine
285 'J' => [$amino_L_wt, $amino_I_wt], # Leucine, Isoleucine
286 'K' => [$amino_K_wt, $amino_K_wt], # Lysine
287 'L' => [$amino_L_wt, $amino_L_wt], # Leucine
288 'M' => [$amino_M_wt, $amino_M_wt], # Methionine
289 'N' => [$amino_N_wt, $amino_N_wt], # Asparagine
290 'O' => [$amino_O_wt, $amino_O_wt], # Pyrrolysine
291 'P' => [$amino_P_wt, $amino_P_wt], # Proline
292 'Q' => [$amino_Q_wt, $amino_Q_wt], # Glutamine
293 'R' => [$amino_R_wt, $amino_R_wt], # Arginine
294 'S' => [$amino_S_wt, $amino_S_wt], # Serine
295 'T' => [$amino_T_wt, $amino_T_wt], # Threonine
296 'U' => [$amino_U_wt, $amino_U_wt], # SelenoCysteine
297 'V' => [$amino_V_wt, $amino_V_wt], # Valine
298 'W' => [$amino_W_wt, $amino_W_wt], # Tryptophan
299 'X' => [$amino_G_wt, $amino_W_wt], # Unknown
300 'Y' => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
301 'Z' => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
304 # Extended Dna / Rna alphabet
305 use vars
( qw($C $O $N $H $P $water) );
306 use vars ( qw($adenine $guanine $cytosine $thymine $uracil));
307 use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi));
308 use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt
309 $rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt));
310 use vars ( qw($dna_weights $rna_weights %Weights));
319 $adenine = 5 * $C + 5 * $N + 5 * $H;
320 $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
321 $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
322 $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
323 $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
325 $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P;
326 # neutral (unionized) form
327 $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
329 # the following are single strand molecular weights / base
330 $dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
331 $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
332 $dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
333 $dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
335 $rna_A_wt = $adenine + $ribose_phosphate - $water;
336 $rna_C_wt = $cytosine + $ribose_phosphate - $water;
337 $rna_G_wt = $guanine + $ribose_phosphate - $water;
338 $rna_U_wt = $uracil + $ribose_phosphate - $water;
341 'A' => [$dna_A_wt,$dna_A_wt], # Adenine
342 'C' => [$dna_C_wt,$dna_C_wt], # Cytosine
343 'G' => [$dna_G_wt,$dna_G_wt], # Guanine
344 'T' => [$dna_T_wt,$dna_T_wt], # Thymine
345 'M' => [$dna_C_wt,$dna_A_wt], # A or C
346 'R' => [$dna_A_wt,$dna_G_wt], # A or G
347 'W' => [$dna_T_wt,$dna_A_wt], # A or T
348 'S' => [$dna_C_wt,$dna_G_wt], # C or G
349 'Y' => [$dna_C_wt,$dna_T_wt], # C or T
350 'K' => [$dna_T_wt,$dna_G_wt], # G or T
351 'V' => [$dna_C_wt,$dna_G_wt], # A or C or G
352 'H' => [$dna_C_wt,$dna_A_wt], # A or C or T
353 'D' => [$dna_T_wt,$dna_G_wt], # A or G or T
354 'B' => [$dna_C_wt,$dna_G_wt], # C or G or T
355 'X' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
356 'N' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
360 'A' => [$rna_A_wt,$rna_A_wt], # Adenine
361 'C' => [$rna_C_wt,$rna_C_wt], # Cytosine
362 'G' => [$rna_G_wt,$rna_G_wt], # Guanine
363 'U' => [$rna_U_wt,$rna_U_wt], # Uracil
364 'M' => [$rna_C_wt,$rna_A_wt], # A or C
365 'R' => [$rna_A_wt,$rna_G_wt], # A or G
366 'W' => [$rna_U_wt,$rna_A_wt], # A or U
367 'S' => [$rna_C_wt,$rna_G_wt], # C or G
368 'Y' => [$rna_C_wt,$rna_U_wt], # C or U
369 'K' => [$rna_U_wt,$rna_G_wt], # G or U
370 'V' => [$rna_C_wt,$rna_G_wt], # A or C or G
371 'H' => [$rna_C_wt,$rna_A_wt], # A or C or U
372 'D' => [$rna_U_wt,$rna_G_wt], # A or G or U
373 'B' => [$rna_C_wt,$rna_G_wt], # C or G or U
374 'X' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
375 'N' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
379 'dna' => $dna_weights,
380 'rna' => $rna_weights,
381 'protein' => $amino_weights,
384 # Amino acid scale: Hydropathicity.
385 # Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132(1982).
386 # http://au.expasy.org/tools/pscale/Hphob.Doolittle.html
388 $amino_hydropathicity = {
414 my($class,@args) = @_;
415 my $self = $class->SUPER::new(@args);
417 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
418 unless ($seqobj->isa("Bio::PrimarySeqI")) {
419 $self->throw("SeqStats works only on PrimarySeqI objects");
421 if ( !defined $seqobj->alphabet ||
422 !defined $Alphabets{$seqobj->alphabet}) {
423 $self->throw("Must have a valid alphabet defined for seq (".
424 join(",",keys %Alphabets));
426 $self->{'_seqref'} = $seqobj;
427 # check the letters in the sequence
428 $self->{'_is_strict'} = _is_alphabet_strict
($seqobj);
432 =head2 count_monomers
434 Title : count_monomers
435 Usage : $rcount = $seq_stats->count_monomers();
436 or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj);
437 Function: Counts the number of each type of monomer (amino acid or
438 base) in the sequence.
439 Ts are counted as Us in RNA sequences.
441 Returns : Reference to a hash in which keys are letters of the
442 genetic alphabet used and values are number of occurrences
443 of the letter in the sequence.
444 Args : None or reference to sequence object
445 Throws : Throws an exception if type of sequence is unknown (ie amino
446 or nucleic)or if unknown letter in alphabet. Ambiguous
447 elements are allowed.
456 my $_is_instance = 1 ;
458 my $object_argument = shift @_;
460 # First we need to determine if the present object is an instance
461 # object or if the sequence object has been passed as an argument
463 if (defined $object_argument) {
467 # If we are using an instance object...
469 if ($self->{'_monomer_count'}) {
470 return $self->{'_monomer_count'}; # return count if previously calculated
472 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
473 $seqobj = $self->{'_seqref'};
476 $seqobj = $object_argument;
478 # Following two lines lead to error in "throw" routine
479 $seqobj->isa("Bio::PrimarySeqI") ||
480 $self->throw("SeqStats works only on PrimarySeqI objects");
481 # is alphabet OK? Is it strict?
482 $_is_strict = _is_alphabet_strict
($seqobj);
485 my $alphabet = $_is_strict ?
$Alphabets_strict{$seqobj->alphabet} :
486 $Alphabets{$seqobj->alphabet} ; # get array of allowed letters
488 # convert everything to upper case to be safe
489 my $seqstring = uc $seqobj->seq();
491 # Since T is used in RichSeq RNA sequences, do conversion locally
492 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
494 # For each letter, count the number of times it appears in
497 foreach $element (@
$alphabet) {
498 # skip terminator symbol which may confuse regex
499 next LETTER
if $element eq '*';
500 $count{$element} = ( $seqstring =~ s/$element/$element/g);
504 $self->{'_monomer_count'} = \
%count; # Save in case called again later
513 Usage : $wt = $seqobj->get_mol_wt() or
514 $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
515 Function: Calculate molecular weight of sequence
516 Ts are counted as Us in RNA sequences.
519 Returns : Reference to two element array containing lower and upper
520 bounds of molecule molecular weight. For DNA and RNA
521 sequences single-stranded weights are returned. If
522 sequence contains no ambiguous elements, both entries in
523 array are equal to molecular weight of molecule.
524 Args : None or reference to sequence object
525 Throws : Exception if type of sequence is unknown (ie not amino or
526 nucleic) or if unknown letter in alphabet. Ambiguous
527 elements are allowed.
535 my $_is_instance = 1 ;
537 my $object_argument = shift @_;
538 my ($weight_array, $rcount);
540 if (defined $object_argument) {
545 if ($weight_array = $self->{'_mol_wt'}) {
546 # return mol. weight if previously calculated
547 return $weight_array;
549 $seqobj = $self->{'_seqref'};
550 $rcount = $self->count_monomers();
552 $seqobj = $object_argument;
553 $seqobj->isa("Bio::PrimarySeqI") ||
554 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
555 $_is_strict = _is_alphabet_strict
($seqobj); # is alphabet OK?
556 $rcount = $self->count_monomers($seqobj);
559 # We will also need to know what type of monomer we are dealing with
560 my $moltype = $seqobj->alphabet();
562 # In general,the molecular weight is bounded below by the sum of the
563 # weights of lower bounds of each alphabet symbol times the number of
564 # occurrences of the symbol in the sequence. A similar upper bound on
565 # the weight is also calculated.
567 # Note that for "strict" (i.e. unambiguous) sequences there is an
568 # inefficiency since the upper bound = the lower bound and there are
569 # two calculations. However, this decrease in performance will be
570 # minor and leads to significantly more readable code.
572 my $weight_lower_bound = 0;
573 my $weight_upper_bound = 0;
574 my $weight_table = $Weights{$moltype};
577 # compute weight of all the residues
578 foreach $element (keys %$rcount) {
579 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
580 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
582 # this tracks only the residues used for counting MW
583 $total_res += $$rcount{$element};
585 if ($moltype =~ /protein/) {
586 # remove H2O during peptide bond formation.
587 $weight_lower_bound -= $water * ($total_res - 1);
588 $weight_upper_bound -= $water * ($total_res - 1);
590 # Correction because phosphate of 5' residue has additional OH and
591 # sugar ring of 3' residue has additional H
592 $weight_lower_bound += $water;
593 $weight_upper_bound += $water;
596 $weight_lower_bound = sprintf("%.1f", $weight_lower_bound);
597 $weight_upper_bound = sprintf("%.1f", $weight_upper_bound);
599 $weight_array = [$weight_lower_bound, $weight_upper_bound];
602 $self->{'_mol_wt'} = $weight_array; # Save in case called again later
604 return $weight_array;
611 Usage : $rcount = $seqstats->count_codons() or
612 $rcount = Bio::Tools::SeqStats->count_codons($seqobj)
613 Function: Counts the number of each type of codons for a dna or rna
614 sequence, starting at the 1st triple of the input sequence.
616 Returns : Reference to a hash in which keys are codons of the genetic
617 alphabet used and values are number of occurrences of the
618 codons in the sequence. All codons with "ambiguous" bases
619 are counted together.
620 Args : None or sequence object
621 Throws : an exception if type of sequence is unknown or protein.
631 my $_is_instance = 1 ;
633 my $object_argument = shift @_;
635 if (defined $object_argument) {
640 if ($rcount = $self->{'_codon_count'}) {
641 return $rcount; # return count if previously calculated
643 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
644 $seqobj = $self->{'_seqref'};
646 $seqobj = $object_argument;
647 $seqobj->isa("Bio::PrimarySeqI") ||
648 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
649 $_is_strict = _is_alphabet_strict
($seqobj);
652 # Codon counts only make sense for nucleic acid sequences
653 my $alphabet = $seqobj->alphabet();
655 unless ($alphabet =~ /[dr]na/i) {
656 $seqobj->throw("Codon counts only meaningful for dna or rna, ".
657 "not for $alphabet sequences.");
660 # If sequence contains ambiguous bases, warn that codons
661 # containing them will all be lumped together in the count.
664 $seqobj->warn("Sequence $seqobj contains ambiguous bases.".
665 " All codons with ambiguous bases will be added together in count.")
666 if $self->verbose >= 0 ;
669 my $seq = $seqobj->seq();
671 # Now step through the string by threes and count the codons
674 while (length($seq) > 2) {
675 $codon = uc substr($seq,0,3);
676 $seq = substr($seq,3);
677 if ($codon =~ /[^ACTGU]/i) {
678 $$rcount{'ambiguous'}++; #lump together ambiguous codons
681 if (!defined $$rcount{$codon}) {
682 $$rcount{$codon}= 1 ;
685 $$rcount{$codon}++; # default
689 $self->{'_codon_count'} = $rcount; # Save in case called again later
696 =head2 hydropathicity
698 Title : hydropathicity
699 Usage : $gravy = $seqstats->hydropathicity(); or
700 $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
702 Function: Calculates the mean Kyte-Doolittle hydropathicity for a
703 protein sequence. Also known as the "gravy" score. Refer to
704 Kyte J., Doolittle R.F., J. Mol. Biol. 157:105-132(1982).
707 Args : None or reference to sequence object
709 Throws : an exception if type of sequence is not protein.
717 my $_is_instance = 1 ;
719 my $object_argument = shift @_;
721 if (defined $object_argument) {
726 if (my $gravy = $self->{'_hydropathicity'}) {
727 return $gravy; # return value if previously calculated
729 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
730 $seqobj = $self->{'_seqref'};
732 $seqobj = $object_argument;
733 $seqobj->isa("Bio::PrimarySeqI") ||
734 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
735 $_is_strict = _is_alphabet_strict
($seqobj);
738 # hydropathicity not menaingful for empty sequences
739 unless ($seqobj->length() > 0) {
740 $seqobj->throw("hydropathicity not defined for zero-length sequences");
743 # hydropathicity only make sense for protein sequences
744 my $alphabet = $seqobj->alphabet();
746 unless ($alphabet =~ /protein/i) {
747 $seqobj->throw("hydropathicity only meaningful for protein, ".
748 "not for $alphabet sequences.");
751 # If sequence contains ambiguous bases, warn that codons
752 # containing them will all be lumped together in the count.
754 unless ($_is_strict ) {
755 $seqobj->throw("Sequence $seqobj contains ambiguous amino acids. ".
756 "Hydropathicity can not be caculated.")
759 my $seq = $seqobj->seq();
761 # Now step through the string and add up the hydropathicity values
764 for my $i ( 0 .. length($seq) ) {
765 my $codon = uc(substr($seq,$i,1));
766 $gravy += $amino_hydropathicity->{$codon}||0; # table look-up
768 $gravy /= length($seq);
772 $self->{'_hydropathicity'} = $gravy; # Save in case called again later
779 =head2 _is_alphabet_strict
781 Title : _is_alphabet_strict
783 Function: internal function to determine whether there are
784 any ambiguous elements in the current sequence
786 Returns : 1 if strict alphabet is being used,
787 0 if ambiguous elements are present
790 Throws : an exception if type of sequence is unknown (ie amino or
791 nucleic) or if unknown letter in alphabet. Ambiguous
792 monomers are allowed.
796 sub _is_alphabet_strict
{
799 my $moltype = $seqobj->alphabet();
801 # convert everything to upper case to be safe
802 my $seqstring = uc $seqobj->seq();
804 # Since T is used in RichSeq RNA sequences, do conversion locally
805 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
807 # First we check if only the 'strict' letters are present in the
808 # sequence string If not, we check whether the remaining letters
809 # are ambiguous monomers or whether there are illegal letters in
812 # $alpha_array is a ref to an array of the 'strictly' allowed letters
813 my $alpha_array = $Alphabets_strict{$moltype} ;
815 # $alphabet contains the allowed letters in string form
816 my $alphabet = join ('', @
$alpha_array) ;
817 unless ($seqstring =~ /[^$alphabet]/) {
821 # Next try to match with the alphabet's ambiguous letters
822 $alpha_array = $Alphabets{$moltype} ;
823 $alphabet = join ('', @
$alpha_array) ;
825 unless ($seqstring =~ /[^$alphabet]/) {
829 # If we got here there is an illegal letter in the sequence
830 $seqobj->throw("Alphabet not OK for $seqobj");
836 Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
837 Function: Displays dna / rna parameters (used for debugging)
847 print "\n adenine = : $adenine \n";
848 print "\n guanine = : $guanine \n";
849 print "\n cytosine = : $cytosine \n";
850 print "\n thymine = : $thymine \n";
851 print "\n uracil = : $uracil \n";
853 print "\n dna_A_wt = : $dna_A_wt \n";
854 print "\n dna_C_wt = : $dna_C_wt \n";
855 print "\n dna_G_wt = : $dna_G_wt \n";
856 print "\n dna_T_wt = : $dna_T_wt \n";
858 print "\n rna_A_wt = : $rna_A_wt \n";
859 print "\n rna_C_wt = : $rna_C_wt \n";
860 print "\n rna_G_wt = : $rna_G_wt \n";
861 print "\n rna_U_wt = : $rna_U_wt \n";