1 #---------------------------------------------------------
5 Bio::Matrix::PSM::ProtMatrix - SiteMatrixI implementation, holds a
6 position scoring matrix (or position weight matrix) with log-odds scoring
11 use Bio::Matrix::PSM::ProtMatrix;
12 # Create from memory by supplying probability matrix hash both as strings or
13 # arrays where the frequencies Hash entries of the form lN refer to an array
14 # of position-specific log-odds scores for amino acid N. Hash entries of the
15 # form pN represent the position-specific probability of finding amino acid N.
18 'id' => 'A. thaliana protein atp1',
20 'lS' => [ '-2', '3', '-3', '2', '-3', '1', '1', '3' ],
21 'lF' => [ '-1', '-4', '0', '-5', '0', '-5', '-4', '-4' ],
22 'lT' => [ '-1', '1', '0', '1', '-2', '-1', '0', '1' ],
23 'lN' => [ '-3', '-1', '-2', '3', '-5', '5', '-2', '0' ],
24 'lK' => [ '-2', '0', '-3', '2', '-3', '2', '-3', '-1' ],
25 'lY' => [ '-2', '-3', '-3', '-4', '-3', '-4', '-4', '-4' ],
26 'lE' => [ '-3', '4', '-3', '2', '-4', '-2', '-3', '2' ],
27 'lV' => [ '0', '-2', '1', '-4', '1', '-4', '-1', '-3' ],
28 'lQ' => [ '-1', '0', '-2', '3', '-4', '1', '-3', '0' ],
29 'lM' => [ '8', '-3', '8', '-3', '1', '-3', '-3', '-3' ],
30 'lC' => [ '-2', '-3', '-3', '-4', '-3', '-4', '-3', '-3' ],
31 'lL' => [ '1', '-3', '1', '-4', '3', '-4', '-2', '-4' ],
32 'lA' => [ '-2', '1', '-2', '0', '-2', '-2', '2', '2' ],
33 'lW' => [ '-2', '-4', '-3', '-5', '-4', '-5', '-5', '-5' ],
34 'lP' => [ '-3', '-2', '-4', '-3', '-1', '-3', '6', '-3' ],
35 'lH' => [ '-2', '-2', '-3', '-2', '-5', '-2', '-2', '-3' ],
36 'lD' => [ '-4', '-1', '-3', '1', '-3', '-1', '-3', '4' ],
37 'lR' => [ '-2', '-1', '-3', '0', '-4', '4', '-4', '-3' ],
38 'lI' => [ '0', '-3', '0', '-4', '6', '-4', '-2', '-2' ],
39 'lG' => [ '-4', '-2', '-4', '-2', '-5', '-3', '-1', '-2' ],
40 'pS' => [ '0', '33', '0', '16', '1', '12', '11', '25' ],
41 'pF' => [ '0', '0', '2', '0', '3', '0', '0', '0' ],
42 'pT' => [ '0', '8', '7', '10', '1', '2', '7', '8' ],
43 'pN' => [ '0', '0', '2', '13', '0', '36', '1', '4' ],
44 'pK' => [ '0', '5', '0', '13', '1', '15', '0', '2' ],
45 'pY' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
46 'pE' => [ '0', '41', '1', '12', '0', '0', '0', '15' ],
47 'pV' => [ '0', '3', '9', '0', '2', '0', '3', '1' ],
48 'pQ' => [ '0', '0', '0', '15', '0', '4', '0', '3' ],
49 'pM' => [ '100', '0', '66', '0', '2', '0', '0', '0' ],
50 'pC' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
51 'pL' => [ '0', '0', '8', '0', '25', '0', '4', '0' ],
52 'pA' => [ '0', '10', '1', '9', '2', '0', '22', '16' ],
53 'pW' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
54 'pP' => [ '0', '0', '0', '0', '3', '1', '45', '0' ],
55 'pH' => [ '0', '0', '0', '0', '0', '0', '1', '0' ],
56 'pD' => [ '0', '0', '1', '7', '2', '2', '0', '22' ],
57 'pR' => [ '0', '0', '0', '3', '0', '27', '0', '0' ],
58 'pI' => [ '0', '0', '3', '0', '59', '1', '2', '3' ],
59 'pG' => [ '0', '0', '0', '1', '0', '0', '4', '1' ],
62 my $matrix = Bio::Matrix::PSM::ProtMatrix( %param );
65 my $site = Bio::Matrix::PSM::ProtMatrix->new(%param);
66 # Or get it from a file:
67 use Bio::Matrix::PSM::IO;
68 my $psmIO = Bio::Matrix::PSM::IO->new(-file => $file, -format => 'psi-blast');
69 while (my $psm = $psmIO->next_psm) {
70 #Now we have a Bio::Matrix::PSM::Psm object,
71 # see Bio::Matrix::PSM::PsmI for details
72 #This is a Bio::Matrix::PSM::ProtMatrix object now
73 my $matrix = $psm->matrix;
76 # Get a simple consensus, where alphabet is:
77 # {A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V,}
78 # choosing the highest probability or N if prob is too low
79 my $consensus = $site->consensus;
81 # Retrieving and using regular expressions:
82 my $regexp = $site->regexp;
83 my $count = grep($regexp,$seq);
84 my $count = ($seq=~ s/$regexp/$1/eg);
85 print "Motif $mid is present $count times in this sequence\n";
89 ProtMatrix is designed to provide some basic methods when working with
90 position scoring (weight) matrices related to protein sequences. A
91 protein PSM consists of 20 vectors with 20 frequencies (one per amino
92 acid per position). This is the minimum information you should
93 provide to construct a PSM object. The vectors can be provided as
94 strings with frequencies where the frequency is {0..a} and a=1. This
95 is the way MEME compressed representation of a matrix and it is quite
96 useful when working with relational DB. If arrays are provided as an
97 input (references to arrays actually) they can be any number, real or
98 integer (frequency or count).
100 When creating the object the constructor will check for positions that
101 equal 0. If such is found it will increase the count for all
102 positions by one and recalculate the frequency. Potential bug - if
103 you are using frequencies and one of the positions is 0 it will change
104 significantly. However, you should never have frequency that equals
107 Throws an exception if: You mix as an input array and string (for
108 example A matrix is given as array, C - as string). The position
109 vector is (0,0,0,0). One of the probability vectors is shorter than
112 Summary of the methods I use most frequently (details below):
114 iupac - return IUPAC compliant consensus as a string
115 score - Returns the score as a real number
116 IC - information content. Returns a real number
117 id - identifier. Returns a string
118 accession - accession number. Returns a string
119 next_pos - return the sequence probably for each letter, IUPAC
120 symbol, IUPAC probability and simple sequence
121 consenus letter for this position. Rewind at the end. Returns a hash.
122 pos - current position get/set. Returns an integer.
123 regexp - construct a regular expression based on IUPAC consensus.
124 For example AGWV will be [Aa][Gg][AaTt][AaCcGg]
126 get_string - gets the probability vector for a single base as a string.
127 get_array - gets the probability vector for a single base as an array.
128 get_logs_array - gets the log-odds vector for a single base as an array.
130 New methods, which might be of interest to anyone who wants to store
131 PSM in a relational database without creating an entry for each
132 position is the ability to compress the PSM vector into a string with
133 losing usually less than 1% of the data. this can be done with:
135 my $str=$matrix->get_compressed_freq('A');
138 my $str=$matrix->get_compressed_logs('A');
140 Loading from a database should be done with new, but is not yet implemented.
141 However you can still uncompress such string with:
143 my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM
147 my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds
153 User feedback is an integral part of the evolution of this and other
154 Bioperl modules. Send your comments and suggestions preferably to one
155 of the Bioperl mailing lists. Your participation is much appreciated.
157 bioperl-l@bioperl.org - General discussion
158 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
162 Please direct usage questions or support issues to the mailing list:
164 I<bioperl-l@bioperl.org>
166 rather than to the module maintainer directly. Many experienced and
167 reponsive experts will be able look at the problem and quickly
168 address it. Please include a thorough description of the problem
169 with code and data examples if at all possible.
171 =head2 Reporting Bugs
173 Report bugs to the Bioperl bug tracking system to help us keep track
174 the bugs and their resolution. Bug reports can be submitted via the
177 https://github.com/bioperl/bioperl-live/issues
179 =head1 AUTHOR - James Thompson
181 Email tex@biosysadmin.com
188 # Let the code begin...
189 package Bio
::Matrix
::PSM
::ProtMatrix
;
192 use base
qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI);
197 Usage : my $site = Bio::Matrix::PSM::ProtMatrix->new(
205 Function : Creates a new Bio::Matrix::PSM::ProtMatrix object from memory
206 Throws : If inconsistent data for all vectors (all 20 amino acids) is
207 provided, if you mix input types (string vs array) or if a
210 Returns : Bio::Matrix::PSM::ProtMatrix object
211 Args : Hash references to log-odds scores and probabilities for
212 position-specific scoring info, e-value (optional), information
213 content (optional), id (optional), model for background distribution
214 of proteins (optional).
219 my ($class, @args) = @_;
220 my $self = $class->SUPER::new
(@args);
222 #Too many things to rearrange, and I am creating simultanuously >500
223 # such objects routinely, so this becomes performance issue
226 (my $key = shift @args) =~ s/-//gi; #deletes all dashes (only dashes)!
227 $input{$key} = shift @args;
230 # get a protein alphabet for processing log-odds scores and probabilities
231 # maybe change this later on to allow for non-standard aa lists?
232 my @alphabet = qw
/A R N D C Q E G H I L K M F P S T W Y V/;
234 foreach my $aa (@alphabet) {
235 $self->{"log$aa"} = defined($input{"l$aa"}) ?
$input{"l$aa"}
236 : $self->throw("Error: No log-odds information for $aa!");
237 $self->{"prob$aa"} = defined($input{"p$aa"}) ?
$input{"p$aa"}
238 : $self->throw("Error: No probability information for $aa!");
241 $self->{_position
} = 0;
242 $self->{IC
} = $input{IC
};
243 $self->{e_val
} = $input{e_val
};
244 $self->{sites
} = $input{sites
};
245 $self->{width
} = $input{width
};
246 $self->{accession_number
} = $input{accession_number
};
247 $self->{_correction
} = defined($input{correction
}) ?
248 $input{correction
} : 1 ; # Correction might be unwanted- supply your own
249 # No id provided, null for the sake of rel db
250 $self->{id
} = defined($input{id
}) ?
$input{id
} : 'null';
251 $self->{_alphabet
} = \
@alphabet;
253 #Make consensus, throw if any one of the vectors is shorter
254 $self = _calculate_consensus
($self,$input{model
});
260 Title : Returns an array (or array reference if desired) to the alphabet
262 Function : Returns an array (or array reference) containing all of the
263 allowable characters for this matrix.
266 Returns : Array or arrary reference.
274 return $self->{_alphabet
};
276 return @
{$self->{_alphabet
}};
280 =head2 _calculate_consensus
282 Title : _calculate_consensus
284 Function : Calculates the consensus sequence for this matrix.
292 sub _calculate_consensus
{
296 # verify that all of the array lengths in %probs are the same
297 my @lengths = map { scalar(@
$_) } map {$self->{"prob$_"}} @
{ $self->{_alphabet
} };
298 my $len = shift @lengths;
300 if ( $_ ne $len ) { $self->throw( "Probability matrix is damaged!\n" ) };
303 # iterate over probs, generate the most likely sequence and put it into
304 # $self->{seq}. Put the probability of this sequence into $self->{seqp}.
305 for ( my $i = 0; $i < $len; $i++ ) {
306 # get a list of all the probabilities at position $i, ordered by $self->{_alphabet}
307 my @probs = map { ${$self->{"prob$_"}}[$i] } @
{ $self->{_alphabet
} };
308 # calculate the consensus of @probs, put sequence into seqp and probabilities into seqp
309 (${$self->{seq
}}[$i],${$self->{seqp
}}[$i]) = $self->_to_cons( @probs, $thresh );
319 Function : Retrieves the next position features: frequencies for all 20 amino
320 acids, log-odds scores for all 20 amino acids at this position,
321 the main (consensus) letter at this position, the probability
322 for the consensus letter to occur at this position and the relative
323 current position as an integer.
326 Returns : hash (or hash reference) (pA,pR,pN,pD,...,logA,logR,logN,logD,aa,prob,rel)
327 - pN entries represent the probability for amino acid N
328 to be at this position
329 - logN entries represent the log-odds score for having amino acid
331 - aa is the consensus amino acid
332 - prob is the probability for the consensus amino acid to be at this
334 - rel is the relative index of the current position (integer)
342 $self->throw("instance method called on class") unless ref $self;
344 my $len = @
{$self->{seq
}};
345 my $pos = $self->{_position
};
347 # return a PSM if we're still within range
350 my %probs = map { ("p$_", ${$self->{"prob$_"}}[$pos]) } @
{$self->{_alphabet
}};
351 my %logs = map { ("l$_", ${$self->{"log$_"}}[$pos]) } @
{$self->{_alphabet
}};
352 my $base = ${$self->{seq
}}[$pos];
353 my $prob = ${$self->{seqp
}}[$pos];
355 $self->{_position
}++;
356 my %hash = ( %probs, %logs, base
=> $base, rel
=> $pos, prob
=> $prob );
358 # decide whether to return the hash or a reference to it
364 } else { # otherwise, reset $self->{_position} and return nothing
365 $self->{_position
} = 0;
375 Function : Gets/sets the current position.
378 Returns : Current position (integer).
379 Args : New position (integer).
385 if (@_) { $self->{_position
} = shift; }
386 return $self->{_position
};
394 Function : Gets/sets the e-value
404 if (@_) { $self->{e_val
} = shift; }
405 return $self->{e_val
};
413 Function : Position-specific information content.
416 Returns : Information content for current position.
417 Args : Information content for current position.
423 if (@_) { $self->{IC
} = shift; }
427 =head2 accession_number
429 Title : accession_number
431 Function: accession number, this will be unique id for the ProtMatrix object as
432 well for any other object, inheriting from ProtMatrix.
435 Returns : New accession number (string)
436 Args : Accession number (string)
440 sub accession_number
{
442 if (@_) { $self->{accession_number
} = shift; }
443 return $self->{accession_number
};
450 Function : Returns the consensus sequence for this PSM.
451 Throws : if supplied with thresold outisde 5..10 range
454 Args : (optional) threshold value 5 to 10 (corresponds to 50-100% at each position
461 $self->_calculate_consensus($thresh) if ($thresh); #Change of threshold
464 foreach my $letter (@
{$self->{seq
}}) {
465 $consensus .= $letter;
473 return $self->consensus;
481 Function: Returns given probability vector as a string. Useful if you want to
482 store things in a rel database, where arrays are not first choice
483 Throws : If the argument is outside {A,C,G,T}
486 Args : character {A,C,G,T}
495 my @prob = @
{$self->{"prob$base"}};
497 $self->throw( "No such base: $base\n");
500 foreach my $prob (@prob) {
501 my $corrected = $prob*10;
502 my $next = sprintf("%.0f",$corrected);
503 $next = 'a' if ($next eq '10');
515 Function : Returns the length of the site
525 my $width = @
{$self->{probA
}};
533 Function : Returns an array with frequencies for a specified amino acid.
536 Returns : Array representing frequencies for specified amino acid.
537 Args : Single amino acid (character).
543 my $letter = uc(shift);
545 $self->throw ("No such base: $letter!\n") unless grep { /$letter/ } @
{$self->{_alphabet
}};
547 return @
{$self->{"prob$letter"}};
551 =head2 get_logs_array
553 Title : get_logs_array
555 Function : Returns an array with log_odds for a specified base
558 Returns : Array representing log-odds scores for specified amino acid.
559 Args : Single amino acid (character).
565 my $letter = uc(shift);
567 $self->throw ("No such base: $letter!\n") unless grep { /$letter/ } @
{$self->{_alphabet
}};
569 return @
{$self->{"log$letter"}};
576 Function : Gets/sets the site id
586 if (@_) { $self->{id
} = shift; }
594 Function : Returns a case-insensitive regular expression which matches the
595 IUPAC convention. X's in consensus sequence will match anything.
599 Args : Threshold for calculating consensus sequence (number in range 0-100
600 representing a percentage). Threshold defaults to 20.
607 if ( @_ ) { my $threshold = shift };
609 my @alphabet = @
{$self->{_alphabet
}};
610 my $width = $self->width;
612 for ( $i = 0; $i < $width; $i++ ) {
613 # get an array of the residues at this position with p > $threshold
614 my @letters = map { uc($_).lc($_) } grep { $self->{"prob$_"}->[$i] >= $threshold } @alphabet;
617 if ( scalar(@letters) == 0 ) {
620 $reg = '['.join('',@letters).']';
628 return join '', @regexp;
637 Function : Returns an array of position-specific regular expressions.
638 X's in consensus sequence will match anything.
641 Returns : Array of position-specific regular expressions.
642 Args : Threshold for calculating consensus sequence (number in range 0-100
643 representing a percentage). Threshold defaults to 20.
644 Notes : Simply calls regexp method in list context.
651 return @
{ $self->regexp };
655 =head2 _compress_array
657 Title : _compress_array
659 Function : Will compress an array of real signed numbers to a string (ie vector of bytes)
660 -127 to +127 for bi-directional(signed) and 0..255 for unsigned ;
662 Example : Internal stuff
664 Args : array reference, followed by max value and direction (optional, defaults to 1),
665 direction of 1 is unsigned, anything else is signed.
669 sub _compress_array
{
670 my ($array,$lm,$direct)=@_;
672 return unless(($array) && ($lm));
673 $direct=1 unless ($direct);
674 my $k1= ($direct==1) ?
(255/$lm) : (127/$lm);
675 foreach my $c (@
{$array}) {
677 $c=-$lm if (($c<-$lm) && ($direct !=1));
678 $c=0 if (($c<0) && ($direct ==1));
679 my $byte=int($k1*$c);
680 $byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits
687 =head2 _uncompress_string
689 Title : _uncompress_string
691 Function : Will uncompress a string (vector of bytes) to create an array of real
692 signed numbers (opposite to_compress_array)
694 Example : Internal stuff
695 Returns : string, followed by max value and direction (optional, defaults to 1),
696 direction of 1 is unsigned, anything else is signed.
701 sub _uncompress_string
{
702 my ($str,$lm,$direct)=@_;
704 return unless(($str) && ($lm));
705 $direct=1 unless ($direct);
706 my $k1= ($direct==1) ?
(255/$lm) : (127/$lm);
707 while (my $c=chop($str)) {
709 $byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits
717 =head2 get_compressed_freq
719 Title : get_compressed_freq
721 Function: A method to provide a compressed frequency vector. It uses one byte to
722 code the frequence for one of the probability vectors for one position.
723 Useful for relational database. Improvement of the previous 0..a coding.
725 Example : my $strA=$self->get_compressed_freq('A');
731 sub get_compressed_freq
{
738 @prob = @
{$self->{probA
}} unless (!defined($self->{probA
}));
742 @prob = @
{$self->{probG
}} unless (!defined($self->{probG
}));
746 @prob = @
{$self->{probC
}} unless (!defined($self->{probC
}));
750 @prob = @
{$self->{probT
}} unless (!defined($self->{probT
}));
753 $self->throw ("No such base: $base!\n");
755 my $str= _compress_array
(\
@prob,1,1);
759 =head2 sequence_match_weight
761 Title : sequence_match_weight
763 Function : This method will calculate the score of a match, based on the PSM
764 if such is associated with the matrix object. Returns undef if no
765 PSM data is available.
766 Throws : if the length of the sequence is different from the matrix width
767 Example : my $score=$matrix->sequence_match_weight('ACGGATAG');
768 Returns : Floating point
773 sub sequence_match_weight
{
775 return unless ($self->{logA
});
777 my $seqlen = length($seq);
778 my $width = $self->width;
779 $self->throw("Error: Input sequence size ($seqlen) not equal to PSM size ($width)!\n")
780 unless (length($seq) == $self->width);
782 my ($score,$i) = (0,0);
783 foreach my $letter ( split //, $seq ) {
784 # add up the score for this position
785 $score += $self->{"log$letter"}->[$i];
796 Function: Converts a single position to IUPAC compliant symbol and returns its probability.
797 Currently returns the most likely amino acid/probability combination.
800 Returns : char, real number representing an amino acid and a probability.
801 Args : real numbers for all 20 amino acids (ordered by alphabet contained
802 in $self->{_alphabet}, minimum probability threshold.
807 my ($self,@probs,$thresh) = @_;
809 # provide a default threshold of 5, corresponds to 5% threshold for
810 # inferring that the aa at any position is the true aa
811 $thresh = 5 unless ( defined $thresh );
813 my ($IUPAC_aa,$max_prob) = ('X',$thresh);
814 for my $aa ( @
{$self->{_alphabet
}} ) {
815 my $prob = shift @probs;
816 if ( $prob > $max_prob ) {
822 return $IUPAC_aa, $max_prob;
829 Function: Converts a single position to simple consensus character and returns
830 its probability. Currently just calls the _to_IUPAC subroutine.
833 Returns : char, real number
834 Args : real numbers for A,C,G,T (positional)
839 return _to_IUPAC
( @_ );
842 =head2 get_all_vectors
844 Title : get_all_vectors
846 Function : returns all possible sequence vectors to satisfy the PFM under
848 Throws : If threshold outside of 0..1 (no sense to do that)
849 Example : my @vectors = $self->get_all_vectors(4);
850 Returns : Array of strings
851 Args : (optional) floating
855 #sub get_all_vectors {
857 # my $thresh = shift;
859 # $self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1));
861 # my @seq = split(//,$self->consensus($thresh*10));
863 # for my $i (0..@{$self->{probA}}) {
864 # push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh);
865 # push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh);
866 # push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh);
867 # push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh);
868 # push @{$perm[$i]},'N' if ($seq[$i] eq 'N');
870 # my $fpos=shift @perm;
871 # my @strings=@$fpos;
872 # foreach my $pos (@perm) {
874 # foreach my $let (@$pos) {
875 # foreach my $string (@strings) {
876 # my $newstring = $string . $let;
877 # push @newstr,$newstring;