2 # BioPerl module for Bio::Tools::SeqStats
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
8 # Copyright Peter Schattner
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::SeqStats - Object holding statistics for one
21 # build a primary nucleic acid or protein sequence object somehow
22 # then build a statistics object from the sequence object
24 $seqobj = Bio::PrimarySeq->new(-seq => 'ACTGTGGCGTCAACTG',
27 $seq_stats = Bio::Tools::SeqStats->new(-seq => $seqobj);
29 # obtain a hash of counts of each type of monomer
30 # (i.e. amino or nucleic acid)
31 print "\nMonomer counts using statistics object\n";
32 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj);
33 $hash_ref = $seq_stats->count_monomers(); # e.g. for DNA sequence
34 foreach $base (sort keys %$hash_ref) {
35 print "Number of bases of type ", $base, "= ",
36 %$hash_ref->{$base},"\n";
39 # obtain the count directly without creating a new statistics object
40 print "\nMonomer counts without statistics object\n";
41 $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
42 foreach $base (sort keys %$hash_ref) {
43 print "Number of bases of type ", $base, "= ",
44 %$hash_ref->{$base},"\n";
48 # obtain hash of counts of each type of codon in a nucleic acid sequence
49 print "\nCodon counts using statistics object\n";
50 $hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence
51 foreach $base (sort keys %$hash_ref) {
52 print "Number of codons of type ", $base, "= ",
53 %$hash_ref->{$base},"\n";
57 print "\nCodon counts without statistics object\n";
58 $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
59 foreach $base (sort keys %$hash_ref) {
60 print "Number of codons of type ", $base, "= ",
61 %$hash_ref->{$base},"\n";
64 # Obtain the molecular weight of a sequence. Since the sequence
65 # may contain ambiguous monomers, the molecular weight is returned
66 # as a (reference to) a two element array containing greatest lower
67 # bound (GLB) and least upper bound (LUB) of the molecular weight
68 $weight = $seq_stats->get_mol_wt();
69 print "\nMolecular weight (using statistics object) of sequence ",
70 $seqobj->id(), " is between ", $$weight[0], " and " ,
74 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
75 print "\nMolecular weight (without statistics object) of sequence ",
76 $seqobj->id(), " is between ", $$weight[0], " and " ,
79 # Calculate mean Kyte-Doolittle hydropathicity (aka "gravy" score)
80 my $prot = Bio::PrimarySeq->new(-seq=>'MSFVLVAPDMLATAAADVVQIGSAVSAGS',
81 -alphabet=>'protein');
82 my $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
83 print "might be hydropathic" if $gravy > 1;
87 Bio::Tools::SeqStats is a lightweight object for the calculation of
88 simple statistical and numerical properties of a sequence. By
89 "lightweight" I mean that only "primary" sequences are handled by the
90 object. The calling script needs to create the appropriate primary
91 sequence to be passed to SeqStats if statistics on a sequence feature
92 are required. Similarly if a codon count is desired for a
93 frame-shifted sequence and/or a negative strand sequence, the calling
94 script needs to create that sequence and pass it to the SeqStats
97 Nota that nucleotide sequences in bioperl do not strictly separate RNA
98 and DNA sequences. By convention, sequences from RNA molecules are
99 shown as is they were DNA. Objects are supposed to make the
100 distinction when needed. This class is one of the few where this
101 distinctions needs to be made. Internally, it changes all Ts into Us
102 before weight and monomer count.
104 SeqStats can be called in two distinct manners. If only a single
105 computation is required on a given sequence object, the method can be
106 called easily using the SeqStats object directly:
108 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
110 Alternately, if several computations will be required on a given
111 sequence object, an "instance" statistics object can be constructed
112 and used for the method calls:
114 $seq_stats = Bio::Tools::SeqStats->new($seqobj);
115 $monomers = $seq_stats->count_monomers();
116 $codons = $seq_stats->count_codons();
117 $weight = $seq_stats->get_mol_wt();
118 $gravy = $seq_stats->hydropathicity();
120 As currently implemented the object can return the following values
127 The molecular weight of the sequence: get_mol_wt()
131 The number of each type of monomer present: count_monomers()
135 The number of each codon present in a nucleic acid sequence:
140 The mean hydropathicity ("gravy" score) of a protein:
145 For DNA and RNA sequences single-stranded weights are returned. The
146 molecular weights are calculated for neutral, or not ionized,
147 nucleic acids. The returned weight is the sum of the
148 base-sugar-phosphate residues of the chain plus one weight of water to
149 to account for the additional OH on the phosphate of the 5' residue
150 and the additional H on the sugar ring of the 3' residue. Note that
151 this leads to a difference of 18 in calculated molecular weights
152 compared to some other available programs (e.g. Informax VectorNTI).
154 Note that since sequences may contain ambiguous monomers (e.g. "M",
155 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
156 returns a two-element array containing the greatest lower bound and
157 least upper bound of the molecule. For a sequence with no ambiguous
158 monomers, the two elements of the returned array will be equal. The
159 method count_codons() handles ambiguous bases by simply counting all
160 ambiguous codons together and issuing a warning to that effect.
163 =head1 DEVELOPERS NOTES
165 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
167 Heikki made tiny adjustments (+/- 0.01 daltons) to amino acid
168 molecular weights to have the output match values in SWISS-PROT.
170 Torsten added hydropathicity calculation.
176 User feedback is an integral part of the evolution of this and other
177 Bioperl modules. Send your comments and suggestions preferably to one
178 of the Bioperl mailing lists. Your participation is much appreciated.
180 bioperl-l@bioperl.org - General discussion
181 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
185 Please direct usage questions or support issues to the mailing list:
187 I<bioperl-l@bioperl.org>
189 rather than to the module maintainer directly. Many experienced and
190 reponsive experts will be able look at the problem and quickly
191 address it. Please include a thorough description of the problem
192 with code and data examples if at all possible.
194 =head2 Reporting Bugs
196 Report bugs to the Bioperl bug tracking system to help us keep track
197 the bugs and their resolution. Bug reports can be submitted the web:
199 https://github.com/bioperl/bioperl-live/issues
201 =head1 AUTHOR - Peter Schattner
203 Email schattner AT alum.mit.edu
205 =head1 CONTRIBUTOR - Torsten Seemann
207 Email torsten.seemann AT infotech.monash.edu.au
211 The rest of the documentation details each of the object
212 methods. Internal methods are usually preceded with a _
217 package Bio
::Tools
::SeqStats
;
219 use vars
qw(%Alphabets %Alphabets_strict $amino_weights
220 $rna_weights $dna_weights %Weights $amino_hydropathicity);
222 use base qw(Bio::Root::Root);
226 'dna' => [ qw(A C G T R Y M K S W H B V D X N) ],
227 'rna' => [ qw(A C G U R Y M K S W H B V D X N) ],
228 'protein' => [ qw(A R N D C Q E G H I L K M F U
229 P S T W X Y V B Z J O *) ], # sac: added B, Z
232 # SAC: new strict alphabet: doesn't allow any ambiguity characters.
233 %Alphabets_strict = (
234 'dna' => [ qw( A C G T ) ],
235 'rna' => [ qw( A C G U ) ],
236 'protein' => [ qw(A R N D C Q E G H I L K M F U
241 # IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
242 # Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
244 # Amino Acid alphabet
246 # ------------------------------------------
248 # ------------------------------------------
250 my $amino_A_wt = 89.09;
251 my $amino_C_wt = 121.15;
252 my $amino_D_wt = 133.1;
253 my $amino_E_wt = 147.13;
254 my $amino_F_wt = 165.19;
255 my $amino_G_wt = 75.07;
256 my $amino_H_wt = 155.16;
257 my $amino_I_wt = 131.17;
258 my $amino_K_wt = 146.19;
259 my $amino_L_wt = 131.17;
260 my $amino_M_wt = 149.21;
261 my $amino_N_wt = 132.12;
262 my $amino_O_wt = 255.31;
263 my $amino_P_wt = 115.13;
264 my $amino_Q_wt = 146.15;
265 my $amino_R_wt = 174.20;
266 my $amino_S_wt = 105.09;
267 my $amino_T_wt = 119.12;
268 my $amino_U_wt = 168.06;
269 my $amino_V_wt = 117.15;
270 my $amino_W_wt = 204.23;
271 my $amino_Y_wt = 181.19;
275 'A' => [$amino_A_wt, $amino_A_wt], # Alanine
276 'B' => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine
277 'C' => [$amino_C_wt, $amino_C_wt], # Cysteine
278 'D' => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
279 'E' => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
280 'F' => [$amino_F_wt, $amino_F_wt], # Phenylalanine
281 'G' => [$amino_G_wt, $amino_G_wt], # Glycine
282 'H' => [$amino_H_wt, $amino_H_wt], # Histidine
283 'I' => [$amino_I_wt, $amino_I_wt], # Isoleucine
284 'J' => [$amino_L_wt, $amino_I_wt], # Leucine, Isoleucine
285 'K' => [$amino_K_wt, $amino_K_wt], # Lysine
286 'L' => [$amino_L_wt, $amino_L_wt], # Leucine
287 'M' => [$amino_M_wt, $amino_M_wt], # Methionine
288 'N' => [$amino_N_wt, $amino_N_wt], # Asparagine
289 'O' => [$amino_O_wt, $amino_O_wt], # Pyrrolysine
290 'P' => [$amino_P_wt, $amino_P_wt], # Proline
291 'Q' => [$amino_Q_wt, $amino_Q_wt], # Glutamine
292 'R' => [$amino_R_wt, $amino_R_wt], # Arginine
293 'S' => [$amino_S_wt, $amino_S_wt], # Serine
294 'T' => [$amino_T_wt, $amino_T_wt], # Threonine
295 'U' => [$amino_U_wt, $amino_U_wt], # SelenoCysteine
296 'V' => [$amino_V_wt, $amino_V_wt], # Valine
297 'W' => [$amino_W_wt, $amino_W_wt], # Tryptophan
298 'X' => [$amino_G_wt, $amino_W_wt], # Unknown
299 'Y' => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
300 'Z' => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
303 # Extended Dna / Rna alphabet
304 use vars
( qw($C $O $N $H $P $water) );
305 use vars ( qw($adenine $guanine $cytosine $thymine $uracil));
306 use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi));
307 use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt
308 $rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt));
309 use vars ( qw($dna_weights $rna_weights %Weights));
318 $adenine = 5 * $C + 5 * $N + 5 * $H;
319 $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
320 $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
321 $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
322 $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
324 $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P;
325 # neutral (unionized) form
326 $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
328 # the following are single strand molecular weights / base
329 $dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
330 $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
331 $dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
332 $dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
334 $rna_A_wt = $adenine + $ribose_phosphate - $water;
335 $rna_C_wt = $cytosine + $ribose_phosphate - $water;
336 $rna_G_wt = $guanine + $ribose_phosphate - $water;
337 $rna_U_wt = $uracil + $ribose_phosphate - $water;
340 'A' => [$dna_A_wt,$dna_A_wt], # Adenine
341 'C' => [$dna_C_wt,$dna_C_wt], # Cytosine
342 'G' => [$dna_G_wt,$dna_G_wt], # Guanine
343 'T' => [$dna_T_wt,$dna_T_wt], # Thymine
344 'M' => [$dna_C_wt,$dna_A_wt], # A or C
345 'R' => [$dna_A_wt,$dna_G_wt], # A or G
346 'W' => [$dna_T_wt,$dna_A_wt], # A or T
347 'S' => [$dna_C_wt,$dna_G_wt], # C or G
348 'Y' => [$dna_C_wt,$dna_T_wt], # C or T
349 'K' => [$dna_T_wt,$dna_G_wt], # G or T
350 'V' => [$dna_C_wt,$dna_G_wt], # A or C or G
351 'H' => [$dna_C_wt,$dna_A_wt], # A or C or T
352 'D' => [$dna_T_wt,$dna_G_wt], # A or G or T
353 'B' => [$dna_C_wt,$dna_G_wt], # C or G or T
354 'X' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
355 'N' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
359 'A' => [$rna_A_wt,$rna_A_wt], # Adenine
360 'C' => [$rna_C_wt,$rna_C_wt], # Cytosine
361 'G' => [$rna_G_wt,$rna_G_wt], # Guanine
362 'U' => [$rna_U_wt,$rna_U_wt], # Uracil
363 'M' => [$rna_C_wt,$rna_A_wt], # A or C
364 'R' => [$rna_A_wt,$rna_G_wt], # A or G
365 'W' => [$rna_U_wt,$rna_A_wt], # A or U
366 'S' => [$rna_C_wt,$rna_G_wt], # C or G
367 'Y' => [$rna_C_wt,$rna_U_wt], # C or U
368 'K' => [$rna_U_wt,$rna_G_wt], # G or U
369 'V' => [$rna_C_wt,$rna_G_wt], # A or C or G
370 'H' => [$rna_C_wt,$rna_A_wt], # A or C or U
371 'D' => [$rna_U_wt,$rna_G_wt], # A or G or U
372 'B' => [$rna_C_wt,$rna_G_wt], # C or G or U
373 'X' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
374 'N' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
378 'dna' => $dna_weights,
379 'rna' => $rna_weights,
380 'protein' => $amino_weights,
383 # Amino acid scale: Hydropathicity.
384 # Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132(1982).
385 # http://au.expasy.org/tools/pscale/Hphob.Doolittle.html
387 $amino_hydropathicity = {
413 my($class,@args) = @_;
414 my $self = $class->SUPER::new(@args);
416 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
417 unless ($seqobj->isa("Bio::PrimarySeqI")) {
418 $self->throw("SeqStats works only on PrimarySeqI objects");
420 if ( !defined $seqobj->alphabet ||
421 !defined $Alphabets{$seqobj->alphabet}) {
422 $self->throw("Must have a valid alphabet defined for seq (".
423 join(",",keys %Alphabets));
425 $self->{'_seqref'} = $seqobj;
426 # check the letters in the sequence
427 $self->{'_is_strict'} = _is_alphabet_strict
($seqobj);
431 =head2 count_monomers
433 Title : count_monomers
434 Usage : $rcount = $seq_stats->count_monomers();
435 or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj);
436 Function: Counts the number of each type of monomer (amino acid or
437 base) in the sequence.
438 Ts are counted as Us in RNA sequences.
440 Returns : Reference to a hash in which keys are letters of the
441 genetic alphabet used and values are number of occurrences
442 of the letter in the sequence.
443 Args : None or reference to sequence object
444 Throws : Throws an exception if type of sequence is unknown (ie amino
445 or nucleic)or if unknown letter in alphabet. Ambiguous
446 elements are allowed.
455 my $_is_instance = 1 ;
457 my $object_argument = shift @_;
459 # First we need to determine if the present object is an instance
460 # object or if the sequence object has been passed as an argument
462 if (defined $object_argument) {
466 # If we are using an instance object...
468 if ($self->{'_monomer_count'}) {
469 return $self->{'_monomer_count'}; # return count if previously calculated
471 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
472 $seqobj = $self->{'_seqref'};
475 $seqobj = $object_argument;
477 # Following two lines lead to error in "throw" routine
478 $seqobj->isa("Bio::PrimarySeqI") ||
479 $self->throw("SeqStats works only on PrimarySeqI objects");
480 # is alphabet OK? Is it strict?
481 $_is_strict = _is_alphabet_strict
($seqobj);
484 my $alphabet = $_is_strict ?
$Alphabets_strict{$seqobj->alphabet} :
485 $Alphabets{$seqobj->alphabet} ; # get array of allowed letters
487 # convert everything to upper case to be safe
488 my $seqstring = uc $seqobj->seq();
490 # Since T is used in RichSeq RNA sequences, do conversion locally
491 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
493 # For each letter, count the number of times it appears in
496 foreach $element (@
$alphabet) {
497 # skip terminator symbol which may confuse regex
498 next LETTER
if $element eq '*';
499 $count{$element} = ( $seqstring =~ s/$element/$element/g);
503 $self->{'_monomer_count'} = \
%count; # Save in case called again later
512 Usage : $wt = $seqobj->get_mol_wt() or
513 $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
514 Function: Calculate molecular weight of sequence
515 Ts are counted as Us in RNA sequences.
518 Returns : Reference to two element array containing lower and upper
519 bounds of molecule molecular weight. For DNA and RNA
520 sequences single-stranded weights are returned. If
521 sequence contains no ambiguous elements, both entries in
522 array are equal to molecular weight of molecule.
523 Args : None or reference to sequence object
524 Throws : Exception if type of sequence is unknown (ie not amino or
525 nucleic) or if unknown letter in alphabet. Ambiguous
526 elements are allowed.
534 my $_is_instance = 1 ;
536 my $object_argument = shift @_;
537 my ($weight_array, $rcount);
539 if (defined $object_argument) {
544 if ($weight_array = $self->{'_mol_wt'}) {
545 # return mol. weight if previously calculated
546 return $weight_array;
548 $seqobj = $self->{'_seqref'};
549 $rcount = $self->count_monomers();
551 $seqobj = $object_argument;
552 $seqobj->isa("Bio::PrimarySeqI") ||
553 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
554 $_is_strict = _is_alphabet_strict
($seqobj); # is alphabet OK?
555 $rcount = $self->count_monomers($seqobj);
558 # We will also need to know what type of monomer we are dealing with
559 my $moltype = $seqobj->alphabet();
561 # In general,the molecular weight is bounded below by the sum of the
562 # weights of lower bounds of each alphabet symbol times the number of
563 # occurrences of the symbol in the sequence. A similar upper bound on
564 # the weight is also calculated.
566 # Note that for "strict" (i.e. unambiguous) sequences there is an
567 # inefficiency since the upper bound = the lower bound and there are
568 # two calculations. However, this decrease in performance will be
569 # minor and leads to significantly more readable code.
571 my $weight_lower_bound = 0;
572 my $weight_upper_bound = 0;
573 my $weight_table = $Weights{$moltype};
576 # compute weight of all the residues
577 foreach $element (keys %$rcount) {
578 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
579 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
581 # this tracks only the residues used for counting MW
582 $total_res += $$rcount{$element};
584 if ($moltype =~ /protein/) {
585 # remove H2O during peptide bond formation.
586 $weight_lower_bound -= $water * ($total_res - 1);
587 $weight_upper_bound -= $water * ($total_res - 1);
589 # Correction because phosphate of 5' residue has additional OH and
590 # sugar ring of 3' residue has additional H
591 $weight_lower_bound += $water;
592 $weight_upper_bound += $water;
595 $weight_lower_bound = sprintf("%.1f", $weight_lower_bound);
596 $weight_upper_bound = sprintf("%.1f", $weight_upper_bound);
598 $weight_array = [$weight_lower_bound, $weight_upper_bound];
601 $self->{'_mol_wt'} = $weight_array; # Save in case called again later
603 return $weight_array;
610 Usage : $rcount = $seqstats->count_codons() or
611 $rcount = Bio::Tools::SeqStats->count_codons($seqobj)
612 Function: Counts the number of each type of codons for a dna or rna
613 sequence, starting at the 1st triple of the input sequence.
615 Returns : Reference to a hash in which keys are codons of the genetic
616 alphabet used and values are number of occurrences of the
617 codons in the sequence. All codons with "ambiguous" bases
618 are counted together.
619 Args : None or sequence object
620 Throws : an exception if type of sequence is unknown or protein.
630 my $_is_instance = 1 ;
632 my $object_argument = shift @_;
634 if (defined $object_argument) {
639 if ($rcount = $self->{'_codon_count'}) {
640 return $rcount; # return count if previously calculated
642 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
643 $seqobj = $self->{'_seqref'};
645 $seqobj = $object_argument;
646 $seqobj->isa("Bio::PrimarySeqI") ||
647 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
648 $_is_strict = _is_alphabet_strict
($seqobj);
651 # Codon counts only make sense for nucleic acid sequences
652 my $alphabet = $seqobj->alphabet();
654 unless ($alphabet =~ /[dr]na/i) {
655 $seqobj->throw("Codon counts only meaningful for dna or rna, ".
656 "not for $alphabet sequences.");
659 # If sequence contains ambiguous bases, warn that codons
660 # containing them will all be lumped together in the count.
663 $seqobj->warn("Sequence $seqobj contains ambiguous bases.".
664 " All codons with ambiguous bases will be added together in count.")
665 if $self->verbose >= 0 ;
668 my $seq = $seqobj->seq();
670 # Now step through the string by threes and count the codons
673 while (length($seq) > 2) {
674 $codon = uc substr($seq,0,3);
675 $seq = substr($seq,3);
676 if ($codon =~ /[^ACTGU]/i) {
677 $$rcount{'ambiguous'}++; #lump together ambiguous codons
680 if (!defined $$rcount{$codon}) {
681 $$rcount{$codon}= 1 ;
684 $$rcount{$codon}++; # default
688 $self->{'_codon_count'} = $rcount; # Save in case called again later
695 =head2 hydropathicity
697 Title : hydropathicity
698 Usage : $gravy = $seqstats->hydropathicity(); or
699 $gravy = Bio::Tools::SeqStats->hydropathicity($seqobj);
701 Function: Calculates the mean Kyte-Doolittle hydropathicity for a
702 protein sequence. Also known as the "gravy" score. Refer to
703 Kyte J., Doolittle R.F., J. Mol. Biol. 157:105-132(1982).
706 Args : None or reference to sequence object
708 Throws : an exception if type of sequence is not protein.
716 my $_is_instance = 1 ;
718 my $object_argument = shift @_;
720 if (defined $object_argument) {
725 if (my $gravy = $self->{'_hydropathicity'}) {
726 return $gravy; # return value if previously calculated
728 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
729 $seqobj = $self->{'_seqref'};
731 $seqobj = $object_argument;
732 $seqobj->isa("Bio::PrimarySeqI") ||
733 $self->throw("Error: SeqStats works only on PrimarySeqI objects");
734 $_is_strict = _is_alphabet_strict
($seqobj);
737 # hydropathicity not menaingful for empty sequences
738 unless ($seqobj->length() > 0) {
739 $seqobj->throw("hydropathicity not defined for zero-length sequences");
742 # hydropathicity only make sense for protein sequences
743 my $alphabet = $seqobj->alphabet();
745 unless ($alphabet =~ /protein/i) {
746 $seqobj->throw("hydropathicity only meaningful for protein, ".
747 "not for $alphabet sequences.");
750 # If sequence contains ambiguous bases, warn that codons
751 # containing them will all be lumped together in the count.
753 unless ($_is_strict ) {
754 $seqobj->throw("Sequence $seqobj contains ambiguous amino acids. ".
755 "Hydropathicity can not be caculated.")
758 my $seq = $seqobj->seq();
760 # Now step through the string and add up the hydropathicity values
763 for my $i ( 0 .. length($seq) ) {
764 my $codon = uc(substr($seq,$i,1));
765 $gravy += $amino_hydropathicity->{$codon}||0; # table look-up
767 $gravy /= length($seq);
771 $self->{'_hydropathicity'} = $gravy; # Save in case called again later
778 =head2 _is_alphabet_strict
780 Title : _is_alphabet_strict
782 Function: internal function to determine whether there are
783 any ambiguous elements in the current sequence
785 Returns : 1 if strict alphabet is being used,
786 0 if ambiguous elements are present
789 Throws : an exception if type of sequence is unknown (ie amino or
790 nucleic) or if unknown letter in alphabet. Ambiguous
791 monomers are allowed.
795 sub _is_alphabet_strict
{
798 my $moltype = $seqobj->alphabet();
800 # convert everything to upper case to be safe
801 my $seqstring = uc $seqobj->seq();
803 # Since T is used in RichSeq RNA sequences, do conversion locally
804 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
806 # First we check if only the 'strict' letters are present in the
807 # sequence string If not, we check whether the remaining letters
808 # are ambiguous monomers or whether there are illegal letters in
811 # $alpha_array is a ref to an array of the 'strictly' allowed letters
812 my $alpha_array = $Alphabets_strict{$moltype} ;
814 # $alphabet contains the allowed letters in string form
815 my $alphabet = join ('', @
$alpha_array) ;
816 unless ($seqstring =~ /[^$alphabet]/) {
820 # Next try to match with the alphabet's ambiguous letters
821 $alpha_array = $Alphabets{$moltype} ;
822 $alphabet = join ('', @
$alpha_array) ;
824 unless ($seqstring =~ /[^$alphabet]/) {
828 # If we got here there is an illegal letter in the sequence
829 $seqobj->throw("Alphabet not OK for $seqobj");
835 Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
836 Function: Displays dna / rna parameters (used for debugging)
846 print "\n adenine = : $adenine \n";
847 print "\n guanine = : $guanine \n";
848 print "\n cytosine = : $cytosine \n";
849 print "\n thymine = : $thymine \n";
850 print "\n uracil = : $uracil \n";
852 print "\n dna_A_wt = : $dna_A_wt \n";
853 print "\n dna_C_wt = : $dna_C_wt \n";
854 print "\n dna_G_wt = : $dna_G_wt \n";
855 print "\n dna_T_wt = : $dna_T_wt \n";
857 print "\n rna_A_wt = : $rna_A_wt \n";
858 print "\n rna_C_wt = : $rna_C_wt \n";
859 print "\n rna_G_wt = : $rna_G_wt \n";
860 print "\n rna_U_wt = : $rna_U_wt \n";