1 package Bio
::SecreTary
::AAComposition
;
5 # this module just has some routines to calculate some
6 # simple parameters characterizing the amino acid composition
7 # of a peptide, such as aliphatic index, gravy index, number of
8 # DRQPEN, number of GASDRQPEN, number of Nitrogen or Oxygen atoms ...
11 my %total_beta_strand_values = (A
=> 0.369, # Ala
34 my %beta_sheet_values = (A
=> 0.220, # Ala
57 # Deleage & Roux ... beta turn
58 my %beta_turn_values = (A
=> 0.338, # Ala
83 { # given a aa sequence and a hash of aa/count pairs, add the counts of the aa's in the sequence to the hash.
86 ; # ref. to hash with AA chars for keys, number of occurences for values
90 #ref to array of 20 amino acids (1 char abbreviations), plus "X" for any other stuff.
91 if ( !defined $AAFhash ) {
94 "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
95 "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"
98 map { $AAFhash->{$_} = 0; } @aas;
100 my $seq_length_a = length $sequence;
101 my $seq_length_b = 0;
103 my $AAchar = chop $sequence;
104 if ( exists $AAFhash->{$AAchar} ) {
105 $AAFhash->{$AAchar}++;
113 "in aa_frequencies, seqlength discrepancy: $seq_length_a, $seq_length_b \n"
114 unless ( $seq_length_a == $seq_length_b );
115 $AAFhash->{"LENGTH"} = $seq_length_b;
119 =head2 function aliphatic_index
121 Synopsis: aliphatic_index($sequence);
122 Description: Calculates and returns the aliphatic index of the sequence.
123 (See http://expasy.org/tools/protparam-doc.html for definition of aliphatic index.)
128 sub aliphatic_index
{
129 my $sequence = shift;
130 my $nA = ( $sequence =~ tr/A// );
131 my $nV = ( $sequence =~ tr/V// );
132 my $nL = ( $sequence =~ tr/L// );
133 my $nI = ( $sequence =~ tr/I// );
134 my $nX = ( $sequence =~ tr/X// );
135 my $L = length($sequence) - $nX;
137 warn "In aliphatic_index. ", length $sequence, " $nX $sequence \n";
140 return 100.0 * ( 1.0 * $nA + 2.9 * $nV + 3.9 * ( $nL + $nI ) ) / $L;
145 =head2 function gravy
147 Synopsis: gravy($sequence);
148 Description: Calculates
149 and returns the "gravy" index of the sequence.
150 (See http://expasy.org/tools/protparam-doc.html for definition of
156 my $sequence = shift;
158 # Kyte and Doolittle hydropathy index: (from Wikipedia "Hydropathy index")
186 my $char = chop $sequence;
187 if ( defined $Hydropaths{$char} )
188 { # standard 20 aa's. Others (incl. X) not counted
189 $sum_h += $Hydropaths{$char};
195 return $sum_h / $count;
202 =head2 function nDRQPEN
204 Synopsis: nDRQPEN($sequence);
205 Description: Calculates
206 and returns the number of amino acids in the sequence which are DRQPE or N.
211 my $sequence = shift;
214 my $c = chop $sequence;
215 $count++ if ( $c =~ /[DRQPEN]/ );
220 =head2 function nGASDRQPEN
222 Synopsis: nGASDRQPEN($sequence);
223 Description: Calculates and returns the number of amino acids
224 in the sequence which are GASDRQPE or N.
229 my $sequence = shift;
232 my $c = chop $sequence;
233 $count++ if ( $c =~ /[GASDRQPEN]/ );
238 sub total_beta_strand
{
239 my $sequence = shift;
240 my $window_size = shift || 9;
241 my $normalize = shift || 1;
242 return window_average
(\
%total_beta_strand_values, $sequence, $window_size, $normalize);
246 my $sequence = shift;
247 my $window_size = shift || 9;
248 my $normalize = shift || 1;
249 return window_average
(\
%beta_sheet_values, $sequence, $window_size, $normalize);
253 my $sequence = shift;
254 my $window_size = shift || 9;
255 my $normalize = shift || 1;
256 return window_average
(\
%beta_turn_values, $sequence, $window_size, $normalize);
260 my $aavals = shift; # ref to hash giving the values associated with each aa
261 my $sequence = shift;
262 my $window_size = shift || 9;
263 my $normalize = shift || 1;
264 my $normalize_factor = ($normalize)?
1.0/$window_size: 1.0;
265 my $hws = int($window_size/2);
266 my @val_array = ( (0) x
length $sequence);
267 my ($minval, $maxval) = (0, 0);
268 if(length $sequence >= $window_size){
270 foreach (0..$window_size-1){
271 $val += $aavals->{substr($sequence, $_, 1)};
273 ($minval, $maxval) = ($val, $val);
274 $val_array[$hws] = $val * $normalize_factor;
276 while($i + $window_size < length $sequence){
277 $val -= $aavals->{substr($sequence, $i, 1)};
278 $val += $aavals->{substr($sequence, $i + $window_size, 1)};
279 $minval = $val if($val < $minval);
280 $maxval = $val if($val > $maxval);
282 $val_array[$i + $hws] = $val * $normalize_factor;
285 return (\
@val_array, $minval*$normalize_factor, $maxval*$normalize_factor); # return reference to val array
292 Synopsis: nVIL($sequence);
293 Description: Calculates
294 and returns the number of amino acids in the sequence which are VI or L.
295 as well as total (not counting X)
299 my $sequence = shift;
300 my ($count, $count_vil) = (0, 0);
302 my $c = chop $sequence;
303 $count++ if( $c =~ /[ACDEFGHIKLMNPQRSTVWY]/ );
304 $count_vil++ if ( $c =~ /[VIL]/ );
306 warn "count is 0. sequence: $sequence \n" if($count == 0);
307 return ($count, $count_vil);
312 =head2 function nNitrogen
314 Synopsis: nNitrogen($sequence);
315 Description: Calculates
316 and returns the number of Nitrogen atoms in this sequence.
321 my $sequence = shift;
324 my $c = chop $sequence;
325 $count += _N_in_aa
($c);
330 =head2 function nOxygen
332 Synopsis: nOxygen($sequence);
333 Description: Calculates
334 and returns the number of Oxygen atoms in the sequence. If the
335 sequence which is analyzed includes a C-terminus then there would
336 be one more O than the result here, but we are counting O's
337 in a part of the protein at the N-terminus, so this function doesn't count
338 that extra C-terminal O.
343 my $sequence = shift;
346 my $c = chop $sequence;
347 $count += _O_in_aa
($c);
352 sub _N_in_aa
{ # number of Nitrogen atoms in each kind of amino acid
354 # (see e.g. Wikipedia "proteinogenic amino acids")
356 my %Nhref = ( # these are the aa's with > 1 Nitrogen
365 if ( exists $Nhref{$aa} ) {
369 return 1; # all others have 1 nitrogen.
373 sub _O_in_aa
{ # Number of Oxygen atoms in each kind of amino acid
375 # (see e.g. Wikipedia "proteinogenic amino acids")
376 # Only 1 Oxygen in COOH group counted.
382 "O" => 3, # pyrrolysine
388 if ( exists $Ohref{$aa} ) {
396 sub molecular_weight
{
399 # (see e.g. Wikipedia "proteinogenic amino acids")
448 my $length = $AAFhash->{"LENGTH"};
449 foreach ( keys %$AAFhash ) {
450 next unless ( defined $MolWeights{$_} );
451 my $f = $AAFhash->{$_}; #frequency of this aa in this seq.
453 $sum_wf += $MolWeights{$_} * $f;
458 # if there are X chars, consider them to have MW equal to avg of others.
459 $sum_wf *= $AAFhash->{"LENGTH"} / $sum_f;
461 $sum_wf -= 18 * ( $length - 1 )
462 ; # subtract for the H2O molecules removed in forming peptide bonds.
467 { # get the electric charge at given pH of the set of aa's stored in aa_hash
468 # for pKa's (3.90, 4.07, etc.) see e.g. Wikipedia "proteinogenic amino acids"
469 # this does not include the charges of the terminal groups
471 ; # ref to hash storing the numbers in the segment analyzed of each of the charged aa's (DECY & HKR)
474 $Q -= $aa_hash->{"D"} / ( 1.0 + 10**( 3.90 - $pH ) );
475 $Q -= $aa_hash->{"E"} / ( 1.0 + 10**( 4.07 - $pH ) );
476 $Q -= $aa_hash->{"C"} / ( 1.0 + 10**( 8.18 - $pH ) );
477 $Q -= $aa_hash->{"Y"} / ( 1.0 + 10**( 10.48 - $pH ) );
479 $Q += $aa_hash->{"H"} / ( 1.0 + 10**( $pH - 6.04 ) );
480 $Q += $aa_hash->{"K"} / ( 1.0 + 10**( $pH - 10.54 ) );
481 $Q += $aa_hash->{"R"} / ( 1.0 + 10**( $pH - 12.48 ) );
485 sub isoelectric_point
486 { # finds the isoelectric point pI, i.e. the pH at which the avg. net electric charge of the sequence is 0.
487 # it might be interesting to get the derivative of Q wrt pH also.
490 $pI ||= 7.0; # initial guess for pH
491 my ( $pI_LB, $pI_UB ) = ( 0, 14.0 ); # initial bounds on the pI
493 0.0001; # close enough if either -$q_tolerance < charge < $q_tolerance or
494 my $pI_tolerance = 0.001; # pI_UB - pI_LB < $pI_tolerance
496 $aa_hash->{"D"} + $aa_hash->{"E"} + $aa_hash->{"C"} + $aa_hash->{"Y"};
497 my $nPos = $aa_hash->{"H"} + $aa_hash->{"K"} + $aa_hash->{"R"};
502 my $Q = charge
( $aa_hash, $pI );
503 if ( $Q < -$q_tolerance )
504 { #charge is negative - pI must be less than pH; lower the pH
507 elsif ( $Q > $q_tolerance )
508 { #charge is positive - pI must be greater than pH; raise the pH
511 else { # close enough to neutral - done
514 $pI = 0.5 * ( $pI_UB + $pI_LB );
515 if ( $pI_UB - $pI_LB < $pI_tolerance ) {
520 else { # negative aa's but no positive aa's -> always negative
524 else { #no negative aa's
528 else { # no negative or positive aa's, return 7.0
535 sub extinction_coefficient
{
537 my $sequence = uc (shift);
540 my $char = chop $sequence;
541 if($char eq 'W'){ $result += 5690; }
542 elsif($char eq 'Y'){ $result += 1280; }
543 elsif($char eq 'C'){ $result += 120; }