lib/Bio/SecreTary/AAComposition.pm

   1 package Bio::SecreTary::AAComposition;
   2 use strict;
   3 use warnings;
   4
   5 # this module just has some routines to calculate some
   6 # simple parameters characterizing the amino acid composition
   7 # of a peptide, such as aliphatic index, gravy index, number of
   8 # DRQPEN, number of GASDRQPEN, number of Nitrogen or Oxygen atoms ...
   9 # Molecular weight.
  10
  11 my %total_beta_strand_values = (A => 0.369, # Ala
  12                 C => 0.539, # Cys
  13                 D => 0.057, # Asp
  14                 E => 0.149, # Glu
  15                 F => 0.603, # Phe
  16                 G => 0.149, # Gly
  17                 H => 0.376, # His
  18                 I => 1.000, # Ile
  19                 K => 0.213, # Lys
  20                 L => 0.638, # Leu
  21                 M => 0.560, # Met
  22                 N => 0.142, # Asn
  23                 P => 0.000, # Pro
  24                 Q => 0.390, # Gln
  25                 R => 0.376, # Arg
  26                 S => 0.298, # Ser
  27                 T => 0.511, # Thr
  28                 V => 1.000, # Val
  29                 W => 0.809, # Trp
  30                 Y => 0.801, # Tyr
  31                 X => 0.449 # unknown
  32                 );
  33
  34 my %beta_sheet_values = (A => 0.220, # Ala
  35                 C => 0.520, # Cys
  36                 D => 0.116, # Asp
  37                 E => 0.132, # Glu
  38                 F => 0.645, # Phe
  39                 G => 0.188, # Gly
  40                 H => 0.316, # His
  41                 I => 0.897, # Ile
  42                 K => 0.228, # Lys
  43                 L => 0.563, # Leu
  44                 M => 0.531, # Met
  45                 N => 0.155, # Asn
  46                 P => 0.000, # Pro
  47                 Q => 0.302, # Gln
  48                 R => 0.351, # Arg
  49                 S => 0.356, # Ser
  50                 T => 0.538, # Thr
  51                 V => 1.000, # Val
  52                 W => 0.591, # Trp
  53                 Y => 0.566,  # Tyr
  54                 X => 0.411 # unknown
  55                 );
  56
  57 # Deleage & Roux ... beta turn
  58 my %beta_turn_values = (A => 0.338, # Ala
  59                 C => 0.448, # Cys
  60                 D => 0.591, # Asp
  61                 E => 0.561, # Glu
  62                 F => 0.237, # Phe
  63                 G => 1.000, # Gly
  64                 H => 0.451, # His
  65                 I => 0.000, # Ile
  66                 K => 0.656, # Lys
  67                 L => 0.265, # Leu
  68                 M => 0.121, # Met
  69                 N => 0.822, # Asn
  70                 P => 0.725, # Pro
  71                 Q => 0.467, # Gln
  72                 R => 0.415, # Arg
  73                 S => 0.664, # Ser
  74                 T => 0.308, # Thr
  75                 V => 0.091, # Val
  76                 W => 0.189, # Trp
  77                 Y => 0.343,  # Tyr
  78                 X => 0.435 # unknown
  79         );
  80
  81
  82 sub aa_frequencies
  83 { # given a aa sequence and a hash of aa/count pairs, add the counts of the aa's in the sequence to the hash.
  84         my $sequence = shift;
  85         my $AAFhash  = shift
  86                 ;  # ref. to hash with AA chars for keys, number of occurences for values
  87
  88                 my %AAFs = ();
  89
  90 #ref to array of 20 amino acids (1 char abbreviations), plus "X" for any other stuff.
  91         if ( !defined $AAFhash ) {
  92                 $AAFhash = \%AAFs;
  93                 my @aas = (
  94                                 "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
  95                                 "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"
  96                           );
  97
  98                 map { $AAFhash->{$_} = 0; } @aas;
  99         }
 100         my $seq_length_a = length $sequence;
 101         my $seq_length_b = 0;
 102         while ($sequence) {
 103                 my $AAchar = chop $sequence;
 104                 if ( exists $AAFhash->{$AAchar} ) {
 105                         $AAFhash->{$AAchar}++;
 106                 }
 107                 else {
 108                         $AAFhash->{"X"}++;
 109                 }
 110                 $seq_length_b++;
 111         }
 112         die
 113                 "in aa_frequencies, seqlength discrepancy: $seq_length_a,  $seq_length_b  \n"
 114                 unless ( $seq_length_a == $seq_length_b );
 115         $AAFhash->{"LENGTH"} = $seq_length_b;
 116         return $AAFhash;
 117 }
 118
 119 =head2 function aliphatic_index
 120
 121 Synopsis: aliphatic_index($sequence);
 122 Description: Calculates and returns the aliphatic index of the sequence.
 123 (See http://expasy.org/tools/protparam-doc.html for definition of aliphatic index.)
 124
 125
 126 =cut
 127
 128 sub aliphatic_index {
 129         my $sequence = shift;
 130         my $nA       = ( $sequence =~ tr/A// );
 131         my $nV       = ( $sequence =~ tr/V// );
 132         my $nL       = ( $sequence =~ tr/L// );
 133         my $nI       = ( $sequence =~ tr/I// );
 134         my $nX       = ( $sequence =~ tr/X// );
 135         my $L        = length($sequence) - $nX;
 136         if ( $L <= 0 ) {
 137                 warn "In aliphatic_index. ", length $sequence, "  $nX $sequence \n";
 138         }
 139         else {
 140                 return 100.0 * ( 1.0 * $nA + 2.9 * $nV + 3.9 * ( $nL + $nI ) ) / $L;
 141         }
 142         return;
 143 }
 144
 145 =head2 function gravy
 146
 147 Synopsis: gravy($sequence);
 148 Description: Calculates
 149 and returns the "gravy" index of the sequence.
 150 (See http://expasy.org/tools/protparam-doc.html for definition of
 151  gravy index.)
 152
 153 =cut
 154
 155 sub gravy {
 156         my $sequence = shift;
 157
 158 # Kyte and Doolittle hydropathy index: (from Wikipedia "Hydropathy index")
 159         my %Hydropaths = (
 160                         "A" => 1.80,
 161                         "R" => -4.50,
 162                         "N" => -3.50,
 163                         "D" => -3.50,
 164                         "C" => 2.5,
 165                         "E" => -3.50,
 166                         "Q" => -3.50,
 167                         "G" => -0.40,
 168                         "H" => -3.20,
 169                         "I" => 4.50,
 170                         "L" => 3.80,
 171                         "K" => -3.90,
 172                         "M" => 1.90,
 173                         "F" => 2.80,
 174                         "P" => -1.60,
 175                         "S" => -0.80,
 176                         "T" => -0.70,
 177                         "W" => -0.90,
 178                         "Y" => -1.30,
 179                         "V" => 4.20
 180                                 );
 181
 182         my $sum_h = 0;
 183         my $count = 0;
 184
 185         while ($sequence) {
 186                 my $char = chop $sequence;
 187                 if ( defined $Hydropaths{$char} )
 188                 {    # standard 20 aa's. Others (incl. X) not counted
 189                         $sum_h += $Hydropaths{$char};
 190                         $count++;
 191                 }
 192         }
 193
 194         if ( $count > 0 ) {
 195                 return $sum_h / $count;
 196         }
 197         else {
 198                 return;    #  -10000.0;
 199         }
 200 }
 201
 202 =head2 function nDRQPEN
 203
 204 Synopsis: nDRQPEN($sequence);
 205 Description: Calculates
 206 and returns the number of amino acids in the sequence which are DRQPE or N.
 207
 208 =cut
 209
 210 sub nDRQPEN {
 211         my $sequence = shift;
 212         my $count    = 0;
 213         while ($sequence) {
 214                 my $c = chop $sequence;
 215                 $count++ if ( $c =~ /[DRQPEN]/ );
 216         }
 217         return $count;
 218 }
 219
 220 =head2 function nGASDRQPEN
 221
 222 Synopsis: nGASDRQPEN($sequence);
 223 Description: Calculates and returns the number of amino acids
 224 in the sequence which are GASDRQPE or N.
 225
 226 =cut
 227
 228 sub nGASDRQPEN {
 229         my $sequence = shift;
 230         my $count    = 0;
 231         while ($sequence) {
 232                 my $c = chop $sequence;
 233                 $count++ if ( $c =~ /[GASDRQPEN]/ );
 234         }
 235         return $count;
 236 }
 237
 238 sub total_beta_strand{
 239         my $sequence = shift;
 240         my $window_size = shift || 9;
 241         my $normalize = shift || 1;
 242         return window_average(\%total_beta_strand_values, $sequence, $window_size, $normalize);
 243 }
 244
 245 sub beta_sheet{
 246         my $sequence = shift;
 247         my $window_size = shift || 9;
 248         my $normalize = shift || 1;
 249         return window_average(\%beta_sheet_values, $sequence, $window_size, $normalize);
 250 }
 251
 252 sub beta_turn{
 253         my $sequence = shift;
 254         my $window_size = shift || 9;
 255         my $normalize = shift || 1;
 256         return window_average(\%beta_turn_values, $sequence, $window_size, $normalize);
 257 }
 258
 259 sub window_average{
 260         my $aavals = shift; # ref to hash giving the values associated with each aa
 261                 my $sequence = shift;
 262         my $window_size = shift || 9;
 263         my $normalize = shift || 1;
 264         my $normalize_factor = ($normalize)? 1.0/$window_size: 1.0;
 265         my $hws = int($window_size/2);
 266         my @val_array = ( (0) x length $sequence);
 267         my ($minval, $maxval) = (0, 0);
 268         if(length $sequence >= $window_size){
 269                 my $val = 0;
 270                 foreach (0..$window_size-1){
 271                         $val += $aavals->{substr($sequence, $_, 1)};
 272                 }
 273         ($minval, $maxval) = ($val, $val);
 274                 $val_array[$hws] = $val * $normalize_factor;
 275                 my $i = 0;
 276                 while($i + $window_size < length $sequence){
 277                         $val -= $aavals->{substr($sequence, $i, 1)};
 278                         $val += $aavals->{substr($sequence, $i + $window_size, 1)};
 279                 $minval = $val if($val < $minval);
 280         $maxval = $val if($val > $maxval);
 281         $i++;
 282                         $val_array[$i + $hws] = $val * $normalize_factor;
 283                 }
 284         }
 285         return (\@val_array, $minval*$normalize_factor, $maxval*$normalize_factor); # return reference to val array
 286 }
 287
 288
 289
 290 =head2 function nVIL
 291
 292 Synopsis: nVIL($sequence);
 293 Description: Calculates
 294 and returns the number of amino acids in the sequence which are VI or L.
 295 as well as total (not counting X)
 296 =cut
 297
 298 sub nVIL {
 299         my $sequence = shift;
 300         my ($count, $count_vil)    = (0, 0);
 301         while ($sequence) {
 302                 my $c = chop $sequence;
 303                 $count++ if( $c =~ /[ACDEFGHIKLMNPQRSTVWY]/ );
 304                 $count_vil++ if ( $c =~ /[VIL]/ );
 305         }
 306         warn "count is 0. sequence: $sequence \n" if($count == 0);
 307         return ($count, $count_vil);
 308 }
 309
 310
 311
 312 =head2 function nNitrogen
 313
 314 Synopsis: nNitrogen($sequence);
 315 Description: Calculates
 316 and returns the number of Nitrogen atoms in this sequence.
 317
 318 =cut
 319
 320 sub nNitrogen {
 321         my $sequence = shift;
 322         my $count    = 0;
 323         while ($sequence) {
 324                 my $c = chop $sequence;
 325                 $count += _N_in_aa($c);
 326         }
 327         return $count;
 328 }
 329
 330 =head2 function nOxygen
 331
 332 Synopsis: nOxygen($sequence);
 333 Description: Calculates
 334 and returns the number of Oxygen atoms in the sequence. If the
 335 sequence which is analyzed includes a C-terminus then there would
 336 be one more O than the result here, but we are counting O's
 337 in a part of the protein at the N-terminus, so this function doesn't count
 338 that extra C-terminal O.
 339
 340 =cut
 341
 342 sub nOxygen {
 343         my $sequence = shift;
 344         my $count    = 0;
 345         while ($sequence) {
 346                 my $c = chop $sequence;
 347                 $count += _O_in_aa($c);
 348         }
 349         return $count;
 350 }
 351
 352 sub _N_in_aa {    # number of Nitrogen atoms in each kind of amino acid
 353
 354 # (see e.g. Wikipedia "proteinogenic amino acids")
 355         my $aa    = shift;
 356         my %Nhref = (        # these are the aa's with > 1 Nitrogen
 357                         "H" => 3,
 358                         "K" => 2,
 359                         "N" => 2,
 360                         "O" => 3,
 361                         "Q" => 2,
 362                         "R" => 4,
 363                         "W" => 2
 364                     );
 365         if ( exists $Nhref{$aa} ) {
 366                 return $Nhref{$aa};
 367         }
 368         else {
 369                 return 1;    # all others have 1 nitrogen.
 370         }
 371 }
 372
 373 sub _O_in_aa {       # Number of Oxygen atoms in each kind of amino acid
 374
 375 # (see e.g. Wikipedia "proteinogenic amino acids")
 376 # Only 1 Oxygen in COOH group counted.
 377         my $aa    = shift;
 378         my %Ohref = (
 379                         "D"  => 3,
 380                         "E", => 3,
 381                         "N"  => 2,
 382                         "O"  => 3,    # pyrrolysine
 383                         "Q"  => 2,
 384                         "S"  => 2,
 385                         "T"  => 2,
 386                         "Y"  => 2
 387                     );
 388         if ( exists $Ohref{$aa} ) {
 389                 return $Ohref{$aa};
 390         }
 391         else {
 392                 return 1;
 393         }
 394 }
 395
 396 sub molecular_weight {
 397         my $AAFhash    = shift;
 398         my %MolWeights = (
 399 # (see e.g. Wikipedia "proteinogenic amino acids")
 400                         "A" => 89.09,
 401                         "C" => 121.16,
 402                         "D" => 133.10,
 403                         "E" => 147.13,
 404                         "F" => 165.19,
 405                         "G" => 75.07,
 406                         "H" => 155.16,
 407                         "I" => 131.18,
 408                         "K" => 146.19,
 409                         "L" => 131.18,
 410                         "M" => 149.21,
 411                         "N" => 132.12,
 412                         "P" => 115.13,
 413                         "Q" => 146.15,
 414                         "R" => 174.20,
 415                         "S" => 105.09,
 416                         "T" => 119.12,
 417                         "V" => 117.15,
 418                         "W" => 204.23,
 419                         "Y" => 181.19,
 420                         );
 421
 422
 423
 424 #     "A" => 89,
 425 #     "R" => 174,
 426 #     "N" => 132,
 427 #     "D" => 133,
 428 #     "C" => 121,
 429 #     "E" => 147,
 430 #     "Q" => 146,
 431 #     "G" => 75,
 432 #     "H" => 155,
 433 #     "I" => 131,
 434 #     "L" => 131,
 435 #     "K" => 146,
 436 #     "M" => 149,
 437 #     "F" => 165,
 438 #     "P" => 115,
 439 #     "S" => 105,
 440 #     "T" => 119,
 441 #     "W" => 204,
 442 #     "Y" => 181,
 443 #     "V" => 117
 444 # );
 445
 446         my $sum_wf = 0;
 447         my $sum_f  = 0;
 448         my $length = $AAFhash->{"LENGTH"};
 449         foreach ( keys %$AAFhash ) {
 450                 next unless ( defined $MolWeights{$_} );
 451                 my $f = $AAFhash->{$_};    #frequency of this aa in this seq.
 452 #       print "$_,  $w  \n";
 453                         $sum_wf += $MolWeights{$_} * $f;
 454                 $sum_f  += $f;
 455         }
 456         if ( $sum_f > 0 ) {
 457
 458 # if there are X chars, consider them to have MW equal to avg of others.
 459                 $sum_wf *= $AAFhash->{"LENGTH"} / $sum_f;
 460         }
 461         $sum_wf -= 18 * ( $length - 1 )
 462                 ;    # subtract for the H2O molecules removed in forming peptide bonds.
 463                 return $sum_wf;
 464 }
 465
 466 sub charge
 467 {    # get the electric charge at given pH of the set of aa's stored in aa_hash
 468 # for pKa's (3.90, 4.07, etc.) see e.g.  Wikipedia "proteinogenic amino acids"
 469 # this does not include the charges of the terminal groups
 470         my $aa_hash = shift
 471                 ; # ref to hash storing the numbers in the segment analyzed of each of the charged aa's (DECY & HKR)
 472                 my $pH = shift;
 473         my $Q  = 0;
 474         $Q -= $aa_hash->{"D"} / ( 1.0 + 10**( 3.90 - $pH ) );
 475         $Q -= $aa_hash->{"E"} / ( 1.0 + 10**( 4.07 - $pH ) );
 476         $Q -= $aa_hash->{"C"} / ( 1.0 + 10**( 8.18 - $pH ) );
 477         $Q -= $aa_hash->{"Y"} / ( 1.0 + 10**( 10.48 - $pH ) );
 478
 479         $Q += $aa_hash->{"H"} / ( 1.0 + 10**( $pH - 6.04 ) );
 480         $Q += $aa_hash->{"K"} / ( 1.0 + 10**( $pH - 10.54 ) );
 481         $Q += $aa_hash->{"R"} / ( 1.0 + 10**( $pH - 12.48 ) );
 482         return $Q;
 483 }
 484
 485 sub isoelectric_point
 486 { # finds the isoelectric point pI, i.e. the pH at which the avg. net electric charge of the sequence is 0.
 487 # it might be interesting to get the derivative of Q wrt pH also.
 488         my $aa_hash = shift;
 489         my $pI      = shift;
 490         $pI ||= 7.0;    # initial guess for pH
 491                 my ( $pI_LB, $pI_UB ) = ( 0, 14.0 );    # initial bounds on the pI
 492                 my $q_tolerance =
 493                 0.0001;  # close enough if either -$q_tolerance < charge < $q_tolerance or
 494                 my $pI_tolerance = 0.001;    # pI_UB - pI_LB < $pI_tolerance
 495                 my $nNeg =
 496                 $aa_hash->{"D"} + $aa_hash->{"E"} + $aa_hash->{"C"} + $aa_hash->{"Y"};
 497         my $nPos = $aa_hash->{"H"} + $aa_hash->{"K"} + $aa_hash->{"R"};
 498
 499         if ( $nNeg > 0 ) {
 500                 if ( $nPos > 0 ) {
 501                         while (1) {
 502                                 my $Q = charge( $aa_hash, $pI );
 503                                 if ( $Q < -$q_tolerance )
 504                                 {    #charge is negative - pI must be less than pH; lower the pH
 505                                         $pI_UB = $pI;
 506                                 }
 507                                 elsif ( $Q > $q_tolerance )
 508                                 { #charge is positive - pI must be greater than pH; raise the pH
 509                                         $pI_LB = $pI;
 510                                 }
 511                                 else {    # close enough to neutral - done
 512                                         return $pI;
 513                                 }
 514                                 $pI = 0.5 * ( $pI_UB + $pI_LB );
 515                                 if ( $pI_UB - $pI_LB < $pI_tolerance ) {
 516                                         return $pI;
 517                                 }
 518                         }
 519                 }
 520                 else {    # negative aa's but no positive aa's -> always negative
 521                         return $pI_LB;
 522                 }
 523         }
 524         else {        #no negative aa's
 525                 if ( $nPos > 0 ) {
 526                         return $pI_UB;
 527                 }
 528                 else {    # no negative or positive aa's, return 7.0
 529                         return 7.0;
 530                 }
 531         }
 532         return;
 533 }
 534
 535 sub extinction_coefficient{
 536
 537         my $sequence = uc (shift);
 538         my $result = 0;
 539         while($sequence){
 540                 my $char = chop $sequence;
 541                 if($char eq 'W'){ $result += 5690; }
 542                 elsif($char eq 'Y'){ $result += 1280; }
 543                 elsif($char eq 'C'){ $result += 120; }
 544         }
 545         return $result;
 546 }
 547
 548 1;