1 #---------------------------------------------------------
5 Bio::Matrix::PSM::SiteMatrix - SiteMatrixI implementation, holds a
6 position scoring matrix (or position weight matrix) and log-odds
10 use Bio::Matrix::PSM::SiteMatrix;
11 # Create from memory by supplying probability matrix hash
12 # both as strings or arrays
13 # where the frequencies $a,$c,$g and $t are supplied either as
14 # arrayref or string. Accordingly, lA, lC, lG and lT are the log
15 # odds (only as arrays, no checks done right now)
16 my ($a,$c,$g,$t,$score,$ic, $mid)=@_;
18 my ($a,$c,$g,$t,$score,$ic,$mid)=('05a011','110550','400001',
19 '100104',0.001,19.2,'CRE1');
20 #Where a stands for all (this frequency=1), see explanation below
21 my %param=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t,
22 -lA=>$la, -lC=>$lc,-lG=>$lg,-lT=>$l,
23 -IC=>$ic,-e_val=>$score, -id=>$mid);
24 my $site=Bio::Matrix::PSM::SiteMatrix->new(%param);
25 #Or get it from a file:
26 use Bio::Matrix::PSM::IO;
27 my $psmIO= Bio::Matrix::PSM::IO->new(-file=>$file, -format=>'transfac');
28 while (my $psm=$psmIO->next_psm) {
29 #Now we have a Bio::Matrix::PSM::Psm object,
30 # see Bio::Matrix::PSM::PsmI for details
31 #This is a Bio::Matrix::PSM::SiteMatrix object now
32 my $matrix=$psm->matrix;
35 # Get a simple consensus, where alphabet is {A,C,G,T,N},
36 # choosing the character that both satisfies a supplied or default threshold
37 # frequency and is the most frequenct character at each position, or N.
38 # So for the position with A, C, G, T frequencies of 0.5, 0.25, 0.10, 0.15,
39 # the simple consensus character will be 'A', whilst for 0.5, 0.5, 0, 0 it
41 my $consensus=$site->consensus;
43 # Get the IUPAC ambiguity code representation of the data in the matrix.
44 # Because the frequencies may have been pseudo-count corrected, insignificant
45 # frequences (below 0.05 by default) are ignored. So a position with
46 # A, C, G, T frequencies of 0.5, 0.5, 0.01, 0.01 will get the IUPAC code 'M',
47 # while 0.97, 0.01, 0.01, 0.01 will get the code 'A' and
48 # 0.25, 0.25, 0.25, 0.25 would get 'N'.
49 my $iupac=$site->IUPAC;
51 # Getting/using regular expression (a representation of the IUPAC string)
52 my $regexp=$site->regexp;
53 my $count=grep($regexp,$seq);
54 my $count=($seq=~ s/$regexp/$1/eg);
55 print "Motif $mid is present $count times in this sequence\n";
59 SiteMatrix is designed to provide some basic methods when working with position
60 scoring (weight) matrices, such as transcription factor binding sites for
61 example. A DNA PSM consists of four vectors with frequencies {A,C,G,T}. This is
62 the minimum information you should provide to construct a PSM object. The
63 vectors can be provided as strings with frequenciesx10 rounded to an int, going
64 from {0..a} and 'a' represents the maximum (10). This is like MEME's compressed
65 representation of a matrix and it is quite useful when working with relational
66 DB. If arrays are provided as an input (references to arrays actually) they can
67 be any number, real or integer (frequency or count).
69 When creating the object you can ask the constructor to make a simple pseudo
70 count correction by adding a number (typically 1) to all positions (with the
71 -correction option). After adding the number the frequencies will be
72 calculated. Only use correction when you supply counts, not frequencies.
74 Throws an exception if: You mix as an input array and string (for example A
75 matrix is given as array, C - as string). The position vector is (0,0,0,0). One
76 of the probability vectors is shorter than the rest.
78 Summary of the methods I use most frequently (details below):
80 iupac - return IUPAC compliant consensus as a string
81 score - Returns the score as a real number
82 IC - information content. Returns a real number
83 id - identifier. Returns a string
84 accession - accession number. Returns a string
85 next_pos - return the sequence probably for each letter, IUPAC
86 symbol, IUPAC probability and simple sequence
87 consenus letter for this position. Rewind at the end. Returns a hash.
88 pos - current position get/set. Returns an integer.
89 regexp - construct a regular expression based on IUPAC consensus.
90 For example AGWV will be [Aa][Gg][AaTt][AaCcGg]
92 get_string - gets the probability vector for a single base as a string.
93 get_array - gets the probability vector for a single base as an array.
94 get_logs_array - gets the log-odds vector for a single base as an array.
96 New methods, which might be of interest to anyone who wants to store
97 PSM in a relational database without creating an entry for each
98 position is the ability to compress the PSM vector into a string with
99 losing usually less than 1% of the data. this can be done with:
101 my $str=$matrix->get_compressed_freq('A');
103 my $str=$matrix->get_compressed_logs('A');
105 Loading from a database should be done with new, but is not yest implemented.
106 However you can still uncompress such string with:
108 my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM
110 my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds
116 User feedback is an integral part of the evolution of this and other
117 Bioperl modules. Send your comments and suggestions preferably to one
118 of the Bioperl mailing lists. Your participation is much appreciated.
120 bioperl-l@bioperl.org - General discussion
121 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
125 Please direct usage questions or support issues to the mailing list:
127 I<bioperl-l@bioperl.org>
129 rather than to the module maintainer directly. Many experienced and
130 reponsive experts will be able look at the problem and quickly
131 address it. Please include a thorough description of the problem
132 with code and data examples if at all possible.
134 =head2 Reporting Bugs
136 Report bugs to the Bioperl bug tracking system to help us keep track
137 the bugs and their resolution. Bug reports can be submitted via the
140 https://github.com/bioperl/bioperl-live/issues
142 =head1 AUTHOR - Stefan Kirov
148 Sendu Bala, bix@sendu.me.uk
152 The rest of the documentation details each of the object methods.
153 Internal methods are usually preceded with a _
157 # Let the code begin...
158 package Bio
::Matrix
::PSM
::SiteMatrix
;
161 use base
qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI);
166 Usage : my $site=Bio::Matrix::PSM::SiteMatrix->new(-pA=>$a,-pC=>$c,
171 Function: Creates a new Bio::Matrix::PSM::SiteMatrix object from memory
172 Throws : If inconsistent data for all vectors (A,C,G and T) is
173 provided, if you mix input types (string vs array) or if a
175 Returns : Bio::Matrix::PSM::SiteMatrix object
176 Args : -pA => vector with the frequencies or counts of A
180 -lA => vector for the log of A
181 -lC => vector for the log of C
182 -lG => vector for the log of G
183 -lT => vector for the log of T
184 -IC => real number, the information content of this matrix
185 -e_val => real number, the expect value
186 -id => string, an identifier
187 -width => int, width of the matrix in nucleotides
188 -sites => int, the number of sites that went into this matrix
189 -model => hash ref, background frequencies for A, C, G and T
190 -correction => number, the number to add to all positions to achieve
191 pseudo count correction (default 0: no correction)
192 NB: do not use correction when your input is
194 -accession_number => string, an accession number
196 Vectors can be strings of the frequencies where the frequencies are
197 multiplied by 10 and rounded to the nearest whole number, and where
198 'a' is used to denote the maximal frequency 10. There should be no
199 punctuation (spaces etc.) in the string. For example, 'a0501'.
200 Alternatively frequencies or counts can be represented by an array
201 ref containing the counts, frequencies or logs as any kind of
207 my ($class, @args) = @_;
208 my $self = $class->SUPER::new
(@args);
210 # Too many things to rearrange, and I am creating simultanuously >500
211 # such objects routinely, so this becomes performance issue
214 (my $key = shift @args) =~ s/-//g; #deletes all dashes (only dashes)!
215 $input{$key} = shift @args;
217 $self->{_position
} = 0;
218 $self->{IC
} = $input{IC
};
219 $self->{e_val
} = $input{e_val
};
220 $self->{width
} = $input{width
};
221 $self->{logA
} = $input{lA
};
222 $self->{logC
} = $input{lC
};
223 $self->{logG
} = $input{lG
};
224 $self->{logT
} = $input{lT
};
225 $self->{sites
} = $input{sites
};
226 $self->{id
} = $input{id
} || 'null';
227 $self->{correction
} = $input{correction
} || 0;
228 $self->{accession_number
} = $input{accession_number
};
229 return $self unless (defined($input{pA
}) && defined($input{pC
}) && defined($input{pG
}) && defined($input{pT
}));
231 # This should go to _initialize?
232 # Check for input type- no mixing alllowed, throw ex
233 if (ref($input{pA
}) =~ /ARRAY/i ) {
234 $self->throw("Mixing matrix data types not allowed: C is not reference") unless(ref($input{pC
}));
235 $self->throw("Mixing matrix data types not allowed: G is not reference") unless (ref($input{pG
}));
236 $self->throw("Mixing matrix data types not allowed: T is not reference") unless (ref($input{pT
}));
237 $self->{probA
} = $input{pA
};
238 $self->{probC
} = $input{pC
};
239 $self->{probG
} = $input{pG
};
240 $self->{probT
} = $input{pT
};
243 $self->throw("Mixing matrix data types not allowed: C is reference") if (ref($input{pC
}));
244 $self->throw("Mixing matrix data types not allowed: G is reference") if (ref($input{pG
}));
245 $self->throw("Mixing matrix data types not allowed: T is reference") if (ref($input{pT
}));
246 $self->{probA
} = [split(//,$input{pA
})];
247 $self->{probC
} = [split(//,$input{pC
})];
248 $self->{probG
} = [split(//,$input{pG
})];
249 $self->{probT
} = [split(//,$input{pT
})];
250 for (my $i=0; $i<= @
{$self->{probA
}}+1; $i++) {
251 # we implictely assume these are MEME-style frequencies x 10, so
252 # 'a' represents the 'maximum', 10. Other positions can actually
253 # add up to over 10 due to rounding, but I don't think that is a
255 if (${$self->{probA
}}[$i] and ${$self->{probA
}}[$i] eq 'a') {
256 ${$self->{probA
}}[$i]='10';
258 if (${$self->{probC
}}[$i] and ${$self->{probC
}}[$i] eq 'a') {
259 ${$self->{probC
}}[$i]='10';
261 if (${$self->{probG
}}[$i] and ${$self->{probG
}}[$i] eq 'a') {
262 ${$self->{probG
}}[$i]='10';
264 if (${$self->{probT
}}[$i] and ${$self->{probT
}}[$i] eq 'a') {
265 ${$self->{probT
}}[$i]='10';
270 # Check for position with 0 for all bases, throw exception if so
271 for (my $i=0;$i <= $#{$self->{probA}}; $i++) {
272 if ((${$self->{probA
}}[$i] + ${$self->{probC
}}[$i] + ${$self->{probG
}}[$i] + ${$self->{probT
}}[$i]) == 0) {
273 $self->throw("Position meaningless-all frequencies are 0");
276 # apply pseudo-count correction to all values - this will result in
277 # very bad frequencies if the input is already frequences and a
278 # correction value as large as 1 is used!
279 if ($self->{correction
}) {
280 ${$self->{probA
}}[$i] += $self->{correction
};
281 ${$self->{probC
}}[$i] += $self->{correction
};
282 ${$self->{probG
}}[$i] += $self->{correction
};
283 ${$self->{probT
}}[$i] += $self->{correction
};
286 # (re)calculate frequencies
287 my $div= ${$self->{probA
}}[$i] + ${$self->{probC
}}[$i] + ${$self->{probG
}}[$i] + ${$self->{probT
}}[$i];
288 ${$self->{probA
}}[$i]=${$self->{probA
}}[$i]/$div;
289 ${$self->{probC
}}[$i]=${$self->{probC
}}[$i]/$div;
290 ${$self->{probG
}}[$i]=${$self->{probG
}}[$i]/$div;
291 ${$self->{probT
}}[$i]=${$self->{probT
}}[$i]/$div;
295 if ((!defined($self->{logA
})) && ($input{model
})) {
296 $self->calc_weight($input{model
});
299 # Make consensus, throw if any one of the vectors is shorter
300 $self->_calculate_consensus;
304 =head2 _calculate_consensus
306 Title : _calculate_consensus
307 Function: Internal stuff
311 sub _calculate_consensus
{
313 my ($lc,$lt,$lg)=($#{$self->{probC}},$#{$self->{probT}},$#{$self->{probG}});
314 my $len=$#{$self->{probA}};
315 $self->throw("Probability matrix is damaged for C: $len vs $lc") if ($len != $lc);
316 $self->throw("Probability matrix is damaged for T: $len vs $lt") if ($len != $lt);
317 $self->throw("Probability matrix is damaged for G: $len vs $lg") if ($len != $lg);
318 for (my $i=0; $i<$len+1; $i++) {
319 #*** IUPACp values not actually used (eg. by next_pos)
320 (${$self->{IUPAC
}}[$i],${$self->{IUPACp
}}[$i])=_to_IUPAC
(${$self->{probA
}}[$i], ${$self->{probC
}}[$i], ${$self->{probG
}}[$i], ${$self->{probT
}}[$i]);
321 (${$self->{seq
}}[$i], ${$self->{seqp
}}[$i]) = _to_cons
(${$self->{probA
}}[$i], ${$self->{probC
}}[$i], ${$self->{probG
}}[$i], ${$self->{probT
}}[$i]);
329 Usage : $obj->calc_weight({A=>0.2562, C=>0.2438, G=>0.2432, T=>0.2568});
330 Function: Recalculates the PSM (or weights) based on the PFM (the frequency
331 matrix) and user supplied background model.
332 Throws : if no model is supplied
334 Args : reference to a hash with background frequencies for A,C,G and T
339 my ($self, $model) = @_;
341 $model{probA
}=$model->{A
};
342 $model{probC
}=$model->{C
};
343 $model{probG
}=$model->{G
};
344 $model{probT
}=$model->{T
};
345 foreach my $let (qw(probA probC probG probT)) {
347 $self->throw('You did not provide valid model\n') unless (($model{$let}>0) && ($model{$let}<1));
348 foreach my $f (@
{$self->{$let}}) {
349 my $w=log($f)-log($model{$let});
354 $self->{$llet}=\
@str;
363 Function: Retrieves the next position features: frequencies for A,C,G,T, the
364 main letter (as in consensus) and the probabilty for this letter to
365 occur at this position and the current position
366 Returns : hash (pA,pC,pG,pT,logA,logC,logG,logT,base,prob,rel)
373 $self->throw("instance method called on class") unless ref $self;
374 my $len=@
{$self->{seq
}};
375 my $pos=$self->{_position
};
378 my $pA=${$self->{probA
}}[$pos];
379 my $pC=${$self->{probC
}}[$pos];
380 my $pG=${$self->{probG
}}[$pos];
381 my $pT=${$self->{probT
}}[$pos];
382 my $lA=${$self->{logA
}}[$pos];
383 my $lC=${$self->{logC
}}[$pos];
384 my $lG=${$self->{logG
}}[$pos];
385 my $lT=${$self->{logT
}}[$pos];
386 my $base=${$self->{seq
}}[$pos];
387 my $prob=${$self->{seqp
}}[$pos];
388 $self->{_position
}++;
389 my %seq=(pA
=>$pA,pT
=>$pT,pC
=>$pC,pG
=>$pG, lA
=>$lA,lT
=>$lT,lC
=>$lC,lG
=>$lG,base
=>$base,rel
=>$pos, prob
=>$prob);
392 else {$self->{_position
}=0; return;}
399 Function: Gets/sets the current position. Converts to 0 if argument is minus
400 and to width if greater than width
408 my $prev = $self->{_position
};
409 if (@_) { $self->{_position
} = shift; }
417 Function: Gets/sets the e-value
418 Returns : real number
419 Args : none to get, real number to set
425 my $prev = $self->{e_val
};
426 if (@_) { $self->{e_val
} = shift; }
434 Function: Get/set the Information Content
435 Returns : real number
436 Args : none to get, real number to set
442 my $prev = $self->{IC
};
443 if (@_) { $self->{IC
} = shift; }
447 =head2 accession_number
449 Title : accession_number
450 Function: Get/set the accession number, this will be unique id for the
451 SiteMatrix object as well for any other object, inheriting from
454 Args : none to get, string to set
458 sub accession_number
{
460 my $prev = $self->{accession_number
};
461 if (@_) { $self->{accession_number
} = shift; }
469 Function: Returns the consensus
471 Args : (optional) threshold value 1 to 10, default 5
472 '5' means the returned characters had a 50% or higher presence at
478 my ($self, $thresh) = @_;
480 my $len=$#{$self->{probA}};
481 for (my $i=0; $i<$len+1; $i++) {
482 (${$self->{seq
}}[$i], ${$self->{seqp
}}[$i]) = _to_cons
(${$self->{probA
}}[$i], ${$self->{probC
}}[$i], ${$self->{probG
}}[$i], ${$self->{probT
}}[$i], $thresh);
486 foreach my $letter (@
{$self->{seq
}}) {
487 $consensus .= $letter;
496 Function: Returns the length of the sites in used to make this matrix
504 my $width=@
{$self->{probA
}};
512 Function: Get/set the number of sites that were used to make this matrix
514 Args : none to get, int to set
520 if (@_) { $self->{sites
} = shift }
521 return $self->{sites
} || return;
528 Function: Returns IUPAC compliant consensus
530 Args : optionally, also supply a whole number (int) of 1 or higher to set
531 the significance level when considering the frequencies. 1 (the
532 default) means a 0.05 significance level: frequencies lower than
533 0.05 will be ignored. 2 Means a 0.005 level, and so on.
538 my ($self, $thresh) = @_;
540 my $len=$#{$self->{probA}};
541 for (my $i=0; $i<$len+1; $i++) {
542 (${$self->{IUPAC
}}[$i],${$self->{IUPACp
}}[$i])=_to_IUPAC
(${$self->{probA
}}[$i], ${$self->{probC
}}[$i], ${$self->{probG
}}[$i], ${$self->{probT
}}[$i], $thresh);
545 my $iu=$self->{IUPAC
};
547 foreach my $let (@
{$iu}) {
557 Function: Converts a single position to IUPAC compliant symbol.
558 For rules see the implementation
559 Returns : char, real number
560 Args : real numbers for frequencies of A,C,G,T (positional)
562 optionally, also supply a whole number (int) of 1 or higher to set
563 the significance level when considering the frequencies. 1 (the
564 default) means a 0.05 significance level: frequencies lower than
565 0.05 will be ignored. 2 Means a 0.005 level, and so on.
570 my ($a, $c, $g, $t, $thresh) = @_;
572 $thresh = int($thresh);
573 $a = sprintf ("%.${thresh}f", $a);
574 $c = sprintf ("%.${thresh}f", $c);
575 $g = sprintf ("%.${thresh}f", $g);
576 $t = sprintf ("%.${thresh}f", $t);
578 my $total = $a + $c + $g + $t;
580 return 'A' if ($a == $total);
581 return 'G' if ($g == $total);
582 return 'C' if ($c == $total);
583 return 'T' if ($t == $total);
585 return 'R' if ($r == $total);
587 return 'Y' if ($y == $total);
589 return 'M' if ($m == $total);
591 return 'K' if ($k == $total);
593 return 'S' if ($s == $total);
595 return 'W' if ($w == $total);
597 return 'D' if ($d == $total);
599 return 'V' if ($v == $total);
601 return 'B' if ($b == $total);
603 return 'H' if ($h == $total);
611 Function: Converts a single position to simple consensus character and returns
612 its probability. For rules see the implementation
613 Returns : char, real number
614 Args : real numbers for A,C,G,T (positional), and optional 5th argument of
615 threshold (as a number between 1 and 10, where 5 is default and
616 means the returned character had a 50% or higher presence at this
622 my ($A, $C, $G, $T, $thresh) = @_;
625 # this multiplication by 10 is just to satisfy the thresh range of 1-10
631 return 'N',10 if (($a<$thresh) && ($c<$thresh) && ($g<$thresh) && ($t<$thresh));
632 return 'N',10 if (($a==$t) && ($a==$c) && ($a==$g));
634 # threshold could be lower than 50%, so must check is not only over
635 # threshold, but also the highest frequency
636 return 'A',$a if (($a>=$thresh) && ($a>$t) && ($a>$c) && ($a>$g));
637 return 'C',$c if (($c>=$thresh) && ($c>$t) && ($c>$a) && ($c>$g));
638 return 'G',$g if (($g>=$thresh) && ($g>$t) && ($g>$c) && ($g>$a));
639 return 'T',$t if (($t>=$thresh) && ($t>$g) && ($t>$c) && ($t>$a));
648 Function: Returns given probability vector as a string. Useful if you want to
649 store things in a rel database, where arrays are not first choice
650 Throws : If the argument is outside {A,C,G,T}
652 Args : character {A,C,G,T}
663 if ($base eq 'A') {@prob= @
{$self->{probA
}}; last BASE
; }
664 if ($base eq 'C') {@prob= @
{$self->{probC
}}; last BASE
; }
665 if ($base eq 'G') {@prob= @
{$self->{probG
}}; last BASE
; }
666 if ($base eq 'T') {@prob= @
{$self->{probT
}}; last BASE
; }
667 $self->throw ("No such base: $base!\n");
670 foreach my $prob (@prob) {
671 my $corrected = $prob*10;
672 my $next=sprintf("%.0f",$corrected);
673 $next='a' if ($next eq '10');
683 Function: Returns an array with frequencies for a specified base
692 return @
{$self->{probA
}} if ($base eq 'A');
693 return @
{$self->{probC
}} if ($base eq 'C');
694 return @
{$self->{probG
}} if ($base eq 'G');
695 return @
{$self->{probT
}} if ($base eq 'T');
696 $self->throw("No such base: $base!\n");
699 =head2 get_logs_array
701 Title : get_logs_array
703 Function: Returns an array with log_odds for a specified base
712 return @
{$self->{logA
}} if (($base eq 'A') && ($self->{logA
}));
713 return @
{$self->{logC
}} if (($base eq 'C') && ($self->{logC
}));
714 return @
{$self->{logG
}} if (($base eq 'G') && ($self->{logG
}));
715 return @
{$self->{logT
}} if (($base eq 'T') && ($self->{logT
}));
716 $self->throw ("No such base: $base!\n") if (!grep(/$base/,qw(A C G T)));
724 Function: Gets/sets the site id
732 my $prev = $self->{id
};
733 if (@_) { $self->{id
} = shift; }
741 Function: Returns a regular expression which matches the IUPAC convention.
742 N will match X, N, - and .
744 Args : none (works at the threshold last used for making the IUPAC string)
751 foreach my $letter (@
{$self->{IUPAC
}}) {
754 if ($letter eq 'A') { $reg='[Aa]'; last LETTER
; }
755 if ($letter eq 'C') { $reg='[Cc]'; last LETTER
; }
756 if ($letter eq 'G') { $reg='[Gg]'; last LETTER
; }
757 if ($letter eq 'T') { $reg='[Tt]'; last LETTER
; }
758 if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER
; }
759 if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER
; }
760 if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER
; }
761 if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER
; }
762 if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER
; }
763 if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER
; }
764 if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER
; }
765 if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER
; }
766 if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER
; }
767 if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER
; }
779 Function: Returns a regular expression which matches the IUPAC convention.
780 N will match X, N, - and .
782 Args : none (works at the threshold last used for making the IUPAC string)
783 To do : I have separated regexp and regexp_array, but
784 maybe they can be rewritten as one - just check what should be returned
791 foreach my $letter (@
{$self->{IUPAC
}}) {
794 if ($letter eq 'A') { $reg='[Aa]'; last LETTER
; }
795 if ($letter eq 'C') { $reg='[Cc]'; last LETTER
; }
796 if ($letter eq 'G') { $reg='[Gg]'; last LETTER
; }
797 if ($letter eq 'T') { $reg='[Tt]'; last LETTER
; }
798 if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER
; }
799 if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER
; }
800 if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER
; }
801 if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER
; }
802 if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER
; }
803 if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER
; }
804 if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER
; }
805 if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER
; }
806 if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER
; }
807 if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER
; }
816 =head2 _compress_array
818 Title : _compress_array
820 Function: Will compress an array of real signed numbers to a string (ie vector
821 of bytes) -127 to +127 for bi-directional(signed) and 0..255 for
824 Args : array reference, followed by an max value and direction (optional,
825 default 1-unsigned),1 unsigned, any other is signed.
829 sub _compress_array
{
830 my ($array,$lm,$direct)=@_;
832 return unless(($array) && ($lm));
833 $direct=1 unless ($direct);
834 my $k1= ($direct==1) ?
(255/$lm) : (127/$lm);
835 foreach my $c (@
{$array}) {
837 $c=-$lm if (($c<-$lm) && ($direct !=1));
838 $c=0 if (($c<0) && ($direct ==1));
839 my $byte=int($k1*$c);
840 $byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits
847 =head2 _uncompress_string
849 Title : _uncompress_string
851 Function: Will uncompress a string (vector of bytes) to create an array of
852 real signed numbers (opposite to_compress_array)
853 Returns : string, followed by an max value and
854 direction (optional, default 1-unsigned), 1 unsigned, any other is signed.
859 sub _uncompress_string
{
860 my ($str,$lm,$direct)=@_;
862 return unless(($str) && ($lm));
863 $direct=1 unless ($direct);
864 my $k1= ($direct==1) ?
(255/$lm) : (127/$lm);
865 foreach my $c (split(//,$str)) {
867 $byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits
874 =head2 get_compressed_freq
876 Title : get_compressed_freq
878 Function: A method to provide a compressed frequency vector. It uses one byte
879 to code the frequence for one of the probability vectors for one
880 position. Useful for relational database. Improvement of the previous
882 Example : my $strA=$self->get_compressed_freq('A');
888 sub get_compressed_freq
{
895 @prob= @
{$self->{probA
}} unless (!defined($self->{probA
}));
899 @prob= @
{$self->{probG
}} unless (!defined($self->{probG
}));
903 @prob= @
{$self->{probC
}} unless (!defined($self->{probC
}));
907 @prob= @
{$self->{probT
}} unless (!defined($self->{probT
}));
910 $self->throw ("No such base: $base!\n");
912 my $str= _compress_array
(\
@prob,1,1);
916 =head2 get_compressed_logs
918 Title : get_compressed_logs
920 Function: A method to provide a compressed log-odd vector. It uses one byte to
921 code the log value for one of the log-odds vectors for one position.
922 Example : my $strA=$self->get_compressed_logs('A');
928 sub get_compressed_logs
{
934 if ($base eq 'A') {@prob= @
{$self->{logA
}} unless (!defined($self->{logA
})); last BASE
; }
935 if ($base eq 'C') {@prob= @
{$self->{logC
}} unless (!defined($self->{logC
})); last BASE
; }
936 if ($base eq 'G') {@prob= @
{$self->{logG
}} unless (!defined($self->{logG
})); last BASE
; }
937 if ($base eq 'T') {@prob= @
{$self->{logT
}} unless (!defined($self->{logT
})); last BASE
; }
938 $self->throw ("No such base: $base!\n");
940 return _compress_array
(\
@prob,1000,2);
943 =head2 sequence_match_weight
945 Title : sequence_match_weight
947 Function: This method will calculate the score of a match, based on the PWM
948 if such is associated with the matrix object. Returns undef if no
949 PWM data is available.
950 Throws : if the length of the sequence is different from the matrix width
951 Example : my $score=$matrix->sequence_match_weight('ACGGATAG');
952 Returns : Floating point
957 sub sequence_match_weight
{
959 return unless ($self->{logA
});
960 my $width=$self->width;
961 $self->throw ("I can calculate the score only for sequence which are exactly my size for $seq, my width is $width\n") unless (length($seq)==@
{$self->{logA
}});
963 my @seq=split(//,$seq);
966 foreach my $pos (@seq) {
968 $self->warn("Position ".($i+1)." of input sequence has unknown (ambiguity?) character '$pos': scores will be wrong") unless defined $self->{$tv};
969 $score += defined $self->{$tv} ?
$self->{$tv}->[$i] : 0;
975 =head2 get_all_vectors
977 Title : get_all_vectors
979 Function: returns all possible sequence vectors to satisfy the PFM under
981 Throws : If threshold outside of 0..1 (no sense to do that)
982 Example : my @vectors=$self->get_all_vectors(4);
983 Returns : Array of strings
984 Args : (optional) floating
988 sub get_all_vectors
{
991 $self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1));
992 my @seq=split(//,$self->consensus($thresh*10));
994 for my $i (0..@
{$self->{probA
}}) {
995 push @
{$perm[$i]},'A' if ($self->{probA
}->[$i]>$thresh);
996 push @
{$perm[$i]},'C' if ($self->{probC
}->[$i]>$thresh);
997 push @
{$perm[$i]},'G' if ($self->{probG
}->[$i]>$thresh);
998 push @
{$perm[$i]},'T' if ($self->{probT
}->[$i]>$thresh);
999 push @
{$perm[$i]},'N' if ($seq[$i] eq 'N');
1001 my $fpos=shift @perm;
1003 foreach my $pos (@perm) {
1005 foreach my $let (@
$pos) {
1006 foreach my $string (@strings) {
1007 my $newstring = $string . $let;
1008 push @newstr,$newstring;