Merge pull request #5191 from solgenomics/topic/quality_control
[sgn.git] / lib / Bio / SecreTary / AAComposition.pm
blobe611d59f4f4d5861ed88473b226e03cd4301982b
1 package Bio::SecreTary::AAComposition;
2 use strict;
3 use warnings;
5 # this module just has some routines to calculate some
6 # simple parameters characterizing the amino acid composition
7 # of a peptide, such as aliphatic index, gravy index, number of
8 # DRQPEN, number of GASDRQPEN, number of Nitrogen or Oxygen atoms ...
9 # Molecular weight.
11 my %total_beta_strand_values = (A => 0.369, # Ala
12 C => 0.539, # Cys
13 D => 0.057, # Asp
14 E => 0.149, # Glu
15 F => 0.603, # Phe
16 G => 0.149, # Gly
17 H => 0.376, # His
18 I => 1.000, # Ile
19 K => 0.213, # Lys
20 L => 0.638, # Leu
21 M => 0.560, # Met
22 N => 0.142, # Asn
23 P => 0.000, # Pro
24 Q => 0.390, # Gln
25 R => 0.376, # Arg
26 S => 0.298, # Ser
27 T => 0.511, # Thr
28 V => 1.000, # Val
29 W => 0.809, # Trp
30 Y => 0.801, # Tyr
31 X => 0.449 # unknown
34 my %beta_sheet_values = (A => 0.220, # Ala
35 C => 0.520, # Cys
36 D => 0.116, # Asp
37 E => 0.132, # Glu
38 F => 0.645, # Phe
39 G => 0.188, # Gly
40 H => 0.316, # His
41 I => 0.897, # Ile
42 K => 0.228, # Lys
43 L => 0.563, # Leu
44 M => 0.531, # Met
45 N => 0.155, # Asn
46 P => 0.000, # Pro
47 Q => 0.302, # Gln
48 R => 0.351, # Arg
49 S => 0.356, # Ser
50 T => 0.538, # Thr
51 V => 1.000, # Val
52 W => 0.591, # Trp
53 Y => 0.566, # Tyr
54 X => 0.411 # unknown
57 # Deleage & Roux ... beta turn
58 my %beta_turn_values = (A => 0.338, # Ala
59 C => 0.448, # Cys
60 D => 0.591, # Asp
61 E => 0.561, # Glu
62 F => 0.237, # Phe
63 G => 1.000, # Gly
64 H => 0.451, # His
65 I => 0.000, # Ile
66 K => 0.656, # Lys
67 L => 0.265, # Leu
68 M => 0.121, # Met
69 N => 0.822, # Asn
70 P => 0.725, # Pro
71 Q => 0.467, # Gln
72 R => 0.415, # Arg
73 S => 0.664, # Ser
74 T => 0.308, # Thr
75 V => 0.091, # Val
76 W => 0.189, # Trp
77 Y => 0.343, # Tyr
78 X => 0.435 # unknown
82 sub aa_frequencies
83 { # given a aa sequence and a hash of aa/count pairs, add the counts of the aa's in the sequence to the hash.
84 my $sequence = shift;
85 my $AAFhash = shift
86 ; # ref. to hash with AA chars for keys, number of occurences for values
88 my %AAFs = ();
90 #ref to array of 20 amino acids (1 char abbreviations), plus "X" for any other stuff.
91 if ( !defined $AAFhash ) {
92 $AAFhash = \%AAFs;
93 my @aas = (
94 "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
95 "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"
98 map { $AAFhash->{$_} = 0; } @aas;
100 my $seq_length_a = length $sequence;
101 my $seq_length_b = 0;
102 while ($sequence) {
103 my $AAchar = chop $sequence;
104 if ( exists $AAFhash->{$AAchar} ) {
105 $AAFhash->{$AAchar}++;
107 else {
108 $AAFhash->{"X"}++;
110 $seq_length_b++;
113 "in aa_frequencies, seqlength discrepancy: $seq_length_a, $seq_length_b \n"
114 unless ( $seq_length_a == $seq_length_b );
115 $AAFhash->{"LENGTH"} = $seq_length_b;
116 return $AAFhash;
119 =head2 function aliphatic_index
121 Synopsis: aliphatic_index($sequence);
122 Description: Calculates and returns the aliphatic index of the sequence.
123 (See http://expasy.org/tools/protparam-doc.html for definition of aliphatic index.)
126 =cut
128 sub aliphatic_index {
129 my $sequence = shift;
130 my $nA = ( $sequence =~ tr/A// );
131 my $nV = ( $sequence =~ tr/V// );
132 my $nL = ( $sequence =~ tr/L// );
133 my $nI = ( $sequence =~ tr/I// );
134 my $nX = ( $sequence =~ tr/X// );
135 my $L = length($sequence) - $nX;
136 if ( $L <= 0 ) {
137 warn "In aliphatic_index. ", length $sequence, " $nX $sequence \n";
139 else {
140 return 100.0 * ( 1.0 * $nA + 2.9 * $nV + 3.9 * ( $nL + $nI ) ) / $L;
142 return;
145 =head2 function gravy
147 Synopsis: gravy($sequence);
148 Description: Calculates
149 and returns the "gravy" index of the sequence.
150 (See http://expasy.org/tools/protparam-doc.html for definition of
151 gravy index.)
153 =cut
155 sub gravy {
156 my $sequence = shift;
158 # Kyte and Doolittle hydropathy index: (from Wikipedia "Hydropathy index")
159 my %Hydropaths = (
160 "A" => 1.80,
161 "R" => -4.50,
162 "N" => -3.50,
163 "D" => -3.50,
164 "C" => 2.5,
165 "E" => -3.50,
166 "Q" => -3.50,
167 "G" => -0.40,
168 "H" => -3.20,
169 "I" => 4.50,
170 "L" => 3.80,
171 "K" => -3.90,
172 "M" => 1.90,
173 "F" => 2.80,
174 "P" => -1.60,
175 "S" => -0.80,
176 "T" => -0.70,
177 "W" => -0.90,
178 "Y" => -1.30,
179 "V" => 4.20
182 my $sum_h = 0;
183 my $count = 0;
185 while ($sequence) {
186 my $char = chop $sequence;
187 if ( defined $Hydropaths{$char} )
188 { # standard 20 aa's. Others (incl. X) not counted
189 $sum_h += $Hydropaths{$char};
190 $count++;
194 if ( $count > 0 ) {
195 return $sum_h / $count;
197 else {
198 return; # -10000.0;
202 =head2 function nDRQPEN
204 Synopsis: nDRQPEN($sequence);
205 Description: Calculates
206 and returns the number of amino acids in the sequence which are DRQPE or N.
208 =cut
210 sub nDRQPEN {
211 my $sequence = shift;
212 my $count = 0;
213 while ($sequence) {
214 my $c = chop $sequence;
215 $count++ if ( $c =~ /[DRQPEN]/ );
217 return $count;
220 =head2 function nGASDRQPEN
222 Synopsis: nGASDRQPEN($sequence);
223 Description: Calculates and returns the number of amino acids
224 in the sequence which are GASDRQPE or N.
226 =cut
228 sub nGASDRQPEN {
229 my $sequence = shift;
230 my $count = 0;
231 while ($sequence) {
232 my $c = chop $sequence;
233 $count++ if ( $c =~ /[GASDRQPEN]/ );
235 return $count;
238 sub total_beta_strand{
239 my $sequence = shift;
240 my $window_size = shift || 9;
241 my $normalize = shift || 1;
242 return window_average(\%total_beta_strand_values, $sequence, $window_size, $normalize);
245 sub beta_sheet{
246 my $sequence = shift;
247 my $window_size = shift || 9;
248 my $normalize = shift || 1;
249 return window_average(\%beta_sheet_values, $sequence, $window_size, $normalize);
252 sub beta_turn{
253 my $sequence = shift;
254 my $window_size = shift || 9;
255 my $normalize = shift || 1;
256 return window_average(\%beta_turn_values, $sequence, $window_size, $normalize);
259 sub window_average{
260 my $aavals = shift; # ref to hash giving the values associated with each aa
261 my $sequence = shift;
262 my $window_size = shift || 9;
263 my $normalize = shift || 1;
264 my $normalize_factor = ($normalize)? 1.0/$window_size: 1.0;
265 my $hws = int($window_size/2);
266 my @val_array = ( (0) x length $sequence);
267 my ($minval, $maxval) = (0, 0);
268 if(length $sequence >= $window_size){
269 my $val = 0;
270 foreach (0..$window_size-1){
271 $val += $aavals->{substr($sequence, $_, 1)};
273 ($minval, $maxval) = ($val, $val);
274 $val_array[$hws] = $val * $normalize_factor;
275 my $i = 0;
276 while($i + $window_size < length $sequence){
277 $val -= $aavals->{substr($sequence, $i, 1)};
278 $val += $aavals->{substr($sequence, $i + $window_size, 1)};
279 $minval = $val if($val < $minval);
280 $maxval = $val if($val > $maxval);
281 $i++;
282 $val_array[$i + $hws] = $val * $normalize_factor;
285 return (\@val_array, $minval*$normalize_factor, $maxval*$normalize_factor); # return reference to val array
290 =head2 function nVIL
292 Synopsis: nVIL($sequence);
293 Description: Calculates
294 and returns the number of amino acids in the sequence which are VI or L.
295 as well as total (not counting X)
296 =cut
298 sub nVIL {
299 my $sequence = shift;
300 my ($count, $count_vil) = (0, 0);
301 while ($sequence) {
302 my $c = chop $sequence;
303 $count++ if( $c =~ /[ACDEFGHIKLMNPQRSTVWY]/ );
304 $count_vil++ if ( $c =~ /[VIL]/ );
306 warn "count is 0. sequence: $sequence \n" if($count == 0);
307 return ($count, $count_vil);
312 =head2 function nNitrogen
314 Synopsis: nNitrogen($sequence);
315 Description: Calculates
316 and returns the number of Nitrogen atoms in this sequence.
318 =cut
320 sub nNitrogen {
321 my $sequence = shift;
322 my $count = 0;
323 while ($sequence) {
324 my $c = chop $sequence;
325 $count += _N_in_aa($c);
327 return $count;
330 =head2 function nOxygen
332 Synopsis: nOxygen($sequence);
333 Description: Calculates
334 and returns the number of Oxygen atoms in the sequence. If the
335 sequence which is analyzed includes a C-terminus then there would
336 be one more O than the result here, but we are counting O's
337 in a part of the protein at the N-terminus, so this function doesn't count
338 that extra C-terminal O.
340 =cut
342 sub nOxygen {
343 my $sequence = shift;
344 my $count = 0;
345 while ($sequence) {
346 my $c = chop $sequence;
347 $count += _O_in_aa($c);
349 return $count;
352 sub _N_in_aa { # number of Nitrogen atoms in each kind of amino acid
354 # (see e.g. Wikipedia "proteinogenic amino acids")
355 my $aa = shift;
356 my %Nhref = ( # these are the aa's with > 1 Nitrogen
357 "H" => 3,
358 "K" => 2,
359 "N" => 2,
360 "O" => 3,
361 "Q" => 2,
362 "R" => 4,
363 "W" => 2
365 if ( exists $Nhref{$aa} ) {
366 return $Nhref{$aa};
368 else {
369 return 1; # all others have 1 nitrogen.
373 sub _O_in_aa { # Number of Oxygen atoms in each kind of amino acid
375 # (see e.g. Wikipedia "proteinogenic amino acids")
376 # Only 1 Oxygen in COOH group counted.
377 my $aa = shift;
378 my %Ohref = (
379 "D" => 3,
380 "E", => 3,
381 "N" => 2,
382 "O" => 3, # pyrrolysine
383 "Q" => 2,
384 "S" => 2,
385 "T" => 2,
386 "Y" => 2
388 if ( exists $Ohref{$aa} ) {
389 return $Ohref{$aa};
391 else {
392 return 1;
396 sub molecular_weight {
397 my $AAFhash = shift;
398 my %MolWeights = (
399 # (see e.g. Wikipedia "proteinogenic amino acids")
400 "A" => 89.09,
401 "C" => 121.16,
402 "D" => 133.10,
403 "E" => 147.13,
404 "F" => 165.19,
405 "G" => 75.07,
406 "H" => 155.16,
407 "I" => 131.18,
408 "K" => 146.19,
409 "L" => 131.18,
410 "M" => 149.21,
411 "N" => 132.12,
412 "P" => 115.13,
413 "Q" => 146.15,
414 "R" => 174.20,
415 "S" => 105.09,
416 "T" => 119.12,
417 "V" => 117.15,
418 "W" => 204.23,
419 "Y" => 181.19,
424 # "A" => 89,
425 # "R" => 174,
426 # "N" => 132,
427 # "D" => 133,
428 # "C" => 121,
429 # "E" => 147,
430 # "Q" => 146,
431 # "G" => 75,
432 # "H" => 155,
433 # "I" => 131,
434 # "L" => 131,
435 # "K" => 146,
436 # "M" => 149,
437 # "F" => 165,
438 # "P" => 115,
439 # "S" => 105,
440 # "T" => 119,
441 # "W" => 204,
442 # "Y" => 181,
443 # "V" => 117
444 # );
446 my $sum_wf = 0;
447 my $sum_f = 0;
448 my $length = $AAFhash->{"LENGTH"};
449 foreach ( keys %$AAFhash ) {
450 next unless ( defined $MolWeights{$_} );
451 my $f = $AAFhash->{$_}; #frequency of this aa in this seq.
452 # print "$_, $w \n";
453 $sum_wf += $MolWeights{$_} * $f;
454 $sum_f += $f;
456 if ( $sum_f > 0 ) {
458 # if there are X chars, consider them to have MW equal to avg of others.
459 $sum_wf *= $AAFhash->{"LENGTH"} / $sum_f;
461 $sum_wf -= 18 * ( $length - 1 )
462 ; # subtract for the H2O molecules removed in forming peptide bonds.
463 return $sum_wf;
466 sub charge
467 { # get the electric charge at given pH of the set of aa's stored in aa_hash
468 # for pKa's (3.90, 4.07, etc.) see e.g. Wikipedia "proteinogenic amino acids"
469 # this does not include the charges of the terminal groups
470 my $aa_hash = shift
471 ; # ref to hash storing the numbers in the segment analyzed of each of the charged aa's (DECY & HKR)
472 my $pH = shift;
473 my $Q = 0;
474 $Q -= $aa_hash->{"D"} / ( 1.0 + 10**( 3.90 - $pH ) );
475 $Q -= $aa_hash->{"E"} / ( 1.0 + 10**( 4.07 - $pH ) );
476 $Q -= $aa_hash->{"C"} / ( 1.0 + 10**( 8.18 - $pH ) );
477 $Q -= $aa_hash->{"Y"} / ( 1.0 + 10**( 10.48 - $pH ) );
479 $Q += $aa_hash->{"H"} / ( 1.0 + 10**( $pH - 6.04 ) );
480 $Q += $aa_hash->{"K"} / ( 1.0 + 10**( $pH - 10.54 ) );
481 $Q += $aa_hash->{"R"} / ( 1.0 + 10**( $pH - 12.48 ) );
482 return $Q;
485 sub isoelectric_point
486 { # finds the isoelectric point pI, i.e. the pH at which the avg. net electric charge of the sequence is 0.
487 # it might be interesting to get the derivative of Q wrt pH also.
488 my $aa_hash = shift;
489 my $pI = shift;
490 $pI ||= 7.0; # initial guess for pH
491 my ( $pI_LB, $pI_UB ) = ( 0, 14.0 ); # initial bounds on the pI
492 my $q_tolerance =
493 0.0001; # close enough if either -$q_tolerance < charge < $q_tolerance or
494 my $pI_tolerance = 0.001; # pI_UB - pI_LB < $pI_tolerance
495 my $nNeg =
496 $aa_hash->{"D"} + $aa_hash->{"E"} + $aa_hash->{"C"} + $aa_hash->{"Y"};
497 my $nPos = $aa_hash->{"H"} + $aa_hash->{"K"} + $aa_hash->{"R"};
499 if ( $nNeg > 0 ) {
500 if ( $nPos > 0 ) {
501 while (1) {
502 my $Q = charge( $aa_hash, $pI );
503 if ( $Q < -$q_tolerance )
504 { #charge is negative - pI must be less than pH; lower the pH
505 $pI_UB = $pI;
507 elsif ( $Q > $q_tolerance )
508 { #charge is positive - pI must be greater than pH; raise the pH
509 $pI_LB = $pI;
511 else { # close enough to neutral - done
512 return $pI;
514 $pI = 0.5 * ( $pI_UB + $pI_LB );
515 if ( $pI_UB - $pI_LB < $pI_tolerance ) {
516 return $pI;
520 else { # negative aa's but no positive aa's -> always negative
521 return $pI_LB;
524 else { #no negative aa's
525 if ( $nPos > 0 ) {
526 return $pI_UB;
528 else { # no negative or positive aa's, return 7.0
529 return 7.0;
532 return;
535 sub extinction_coefficient{
537 my $sequence = uc (shift);
538 my $result = 0;
539 while($sequence){
540 my $char = chop $sequence;
541 if($char eq 'W'){ $result += 5690; }
542 elsif($char eq 'Y'){ $result += 1280; }
543 elsif($char eq 'C'){ $result += 120; }
545 return $result;