Bio/Matrix/PSM/SiteMatrix.pm

   1 #---------------------------------------------------------
   2
   3 =head1 NAME
   4
   5 Bio::Matrix::PSM::SiteMatrix - SiteMatrixI implementation, holds a
   6 position scoring matrix (or position weight matrix) and log-odds
   7
   8 =head1 SYNOPSIS
   9
  10   use Bio::Matrix::PSM::SiteMatrix;
  11   # Create from memory by supplying probability matrix hash
  12   # both as strings or arrays
  13   # where the frequencies  $a,$c,$g and $t are supplied either as
  14   # arrayref or string. Accordingly, lA, lC, lG and lT are the log
  15   # odds (only as arrays, no checks done right now)
  16   my ($a,$c,$g,$t,$score,$ic, $mid)=@_;
  17   #or
  18   my ($a,$c,$g,$t,$score,$ic,$mid)=('05a011','110550','400001',
  19                                     '100104',0.001,19.2,'CRE1');
  20   #Where a stands for all (this frequency=1), see explanation below
  21   my %param=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t,
  22              -lA=>$la, -lC=>$lc,-lG=>$lg,-lT=>$l,
  23              -IC=>$ic,-e_val=>$score, -id=>$mid);
  24   my $site=Bio::Matrix::PSM::SiteMatrix->new(%param);
  25   #Or get it from a file:
  26   use Bio::Matrix::PSM::IO;
  27   my $psmIO= Bio::Matrix::PSM::IO->new(-file=>$file, -format=>'transfac');
  28   while (my $psm=$psmIO->next_psm) {
  29     #Now we have a Bio::Matrix::PSM::Psm object,
  30     # see Bio::Matrix::PSM::PsmI for details
  31     #This is a Bio::Matrix::PSM::SiteMatrix object now
  32     my $matrix=$psm->matrix;
  33   }
  34
  35   # Get a simple consensus, where alphabet is {A,C,G,T,N},
  36   # choosing the character that both satisfies a supplied or default threshold
  37   # frequency and is the most frequenct character at each position, or N.
  38   # So for the position with A, C, G, T frequencies of 0.5, 0.25, 0.10, 0.15,
  39   # the simple consensus character will be 'A', whilst for 0.5, 0.5, 0, 0 it
  40   # would be 'N'.
  41   my $consensus=$site->consensus;
  42
  43   # Get the IUPAC ambiguity code representation of the data in the matrix.
  44   # Because the frequencies may have been pseudo-count corrected, insignificant
  45   # frequences (below 0.05 by default) are ignored. So a position with
  46   # A, C, G, T frequencies of 0.5, 0.5, 0.01, 0.01 will get the IUPAC code 'M',
  47   # while 0.97, 0.01, 0.01, 0.01 will get the code 'A' and
  48   # 0.25, 0.25, 0.25, 0.25 would get 'N'.
  49   my $iupac=$site->IUPAC;
  50
  51   # Getting/using regular expression (a representation of the IUPAC string)
  52   my $regexp=$site->regexp;
  53   my $count=grep($regexp,$seq);
  54   my $count=($seq=~ s/$regexp/$1/eg);
  55   print "Motif $mid is present $count times in this sequence\n";
  56
  57 =head1 DESCRIPTION
  58
  59 SiteMatrix is designed to provide some basic methods when working with position
  60 scoring (weight) matrices, such as transcription factor binding sites for
  61 example. A DNA PSM consists of four vectors with frequencies {A,C,G,T}. This is
  62 the minimum information you should provide to construct a PSM object. The
  63 vectors can be provided as strings with frequenciesx10 rounded to an int, going
  64 from {0..a} and 'a' represents the maximum (10). This is like MEME's compressed
  65 representation of a matrix and it is quite useful when working with relational
  66 DB. If arrays are provided as an input (references to arrays actually) they can
  67 be any number, real or integer (frequency or count).
  68
  69 When creating the object you can ask the constructor to make a simple pseudo
  70 count correction by adding a number (typically 1) to all positions (with the
  71 -correction option). After adding the number the frequencies will be
  72 calculated. Only use correction when you supply counts, not frequencies.
  73
  74 Throws an exception if: You mix as an input array and string (for example A
  75 matrix is given as array, C - as string). The position vector is (0,0,0,0). One
  76 of the probability vectors is shorter than the rest.
  77
  78 Summary of the methods I use most frequently (details below):
  79
  80   iupac - return IUPAC compliant consensus as a string
  81   score - Returns the score as a real number
  82   IC - information content. Returns a real number
  83   id - identifier. Returns a string
  84   accession - accession number. Returns a string
  85   next_pos - return the sequence probably for each letter, IUPAC
  86       symbol, IUPAC probability and simple sequence
  87   consenus letter for this position. Rewind at the end. Returns a hash.
  88   pos - current position get/set. Returns an integer.
  89   regexp - construct a regular expression based on IUPAC consensus.
  90       For example AGWV will be [Aa][Gg][AaTt][AaCcGg]
  91   width - site width
  92   get_string - gets the probability vector for a single base as a string.
  93   get_array - gets the probability vector for a single base as an array.
  94   get_logs_array - gets the log-odds vector for a single base as an array.
  95
  96 New methods, which might be of interest to anyone who wants to store
  97 PSM in a relational database without creating an entry for each
  98 position is the ability to compress the PSM vector into a string with
  99 losing usually less than 1% of the data.  this can be done with:
 100
 101   my $str=$matrix->get_compressed_freq('A');
 102 or
 103   my $str=$matrix->get_compressed_logs('A');
 104
 105 Loading from a database should be done with new, but is not yest implemented.
 106 However you can still uncompress such string with:
 107
 108   my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM
 109 or
 110   my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds
 111
 112 =head1 FEEDBACK
 113
 114 =head2 Mailing Lists
 115
 116 User feedback is an integral part of the evolution of this and other
 117 Bioperl modules. Send your comments and suggestions preferably to one
 118 of the Bioperl mailing lists.  Your participation is much appreciated.
 119
 120   bioperl-l@bioperl.org                  - General discussion
 121   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 122
 123 =head2 Support
 124
 125 Please direct usage questions or support issues to the mailing list:
 126
 127 I<bioperl-l@bioperl.org>
 128
 129 rather than to the module maintainer directly. Many experienced and
 130 reponsive experts will be able look at the problem and quickly
 131 address it. Please include a thorough description of the problem
 132 with code and data examples if at all possible.
 133
 134 =head2 Reporting Bugs
 135
 136 Report bugs to the Bioperl bug tracking system to help us keep track
 137 the bugs and their resolution.  Bug reports can be submitted via the
 138 web:
 139
 140   https://github.com/bioperl/bioperl-live/issues
 141
 142 =head1 AUTHOR - Stefan Kirov
 143
 144 Email skirov@utk.edu
 145
 146 =head1 CONTRIBUTORS
 147
 148 Sendu Bala, bix@sendu.me.uk
 149
 150 =head1 APPENDIX
 151
 152 The rest of the documentation details each of the object methods.
 153 Internal methods are usually preceded with a _
 154
 155 =cut
 156
 157 # Let the code begin...
 158 package Bio::Matrix::PSM::SiteMatrix;
 159 use strict;
 160
 161 use base qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI);
 162
 163 =head2 new
 164
 165  Title   : new
 166  Usage   : my $site=Bio::Matrix::PSM::SiteMatrix->new(-pA=>$a,-pC=>$c,
 167                                                      -pG=>$g,-pT=>$t,
 168                                                      -IC=>$ic,
 169                                                      -e_val=>$score,
 170                                                      -id=>$mid);
 171  Function: Creates a new Bio::Matrix::PSM::SiteMatrix object from memory
 172  Throws :  If inconsistent data for all vectors (A,C,G and T) is
 173            provided, if you mix input types (string vs array) or if a
 174            position freq is 0.
 175  Returns :  Bio::Matrix::PSM::SiteMatrix object
 176  Args    :  -pA    => vector with the frequencies or counts of A
 177             -pC    => vector for C
 178             -pG    => vector for G
 179             -pt    => vector for T
 180             -lA    => vector for the log of A
 181             -lC    => vector for the log of C
 182             -lG    => vector for the log of G
 183             -lT    => vector for the log of T
 184             -IC    => real number, the information content of this matrix
 185             -e_val => real number, the expect value
 186             -id    => string, an identifier
 187             -width => int, width of the matrix in nucleotides
 188             -sites => int, the number of sites that went into this matrix
 189             -model => hash ref, background frequencies for A, C, G and T
 190             -correction => number, the number to add to all positions to achieve
 191                            pseudo count correction (default 0: no correction)
 192                            NB: do not use correction when your input is
 193                            frequences!
 194             -accession_number => string, an accession number
 195
 196             Vectors can be strings of the frequencies where the frequencies are
 197             multiplied by 10 and rounded to the nearest whole number, and where
 198             'a' is used to denote the maximal frequency 10. There should be no
 199             punctuation (spaces etc.) in the string. For example, 'a0501'.
 200             Alternatively frequencies or counts can be represented by an array
 201             ref containing the counts, frequencies or logs as any kind of
 202             number.
 203
 204 =cut
 205
 206 sub new {
 207     my ($class, @args) = @_;
 208     my $self = $class->SUPER::new(@args);
 209     my $consensus;
 210     # Too many things to rearrange, and I am creating simultanuously >500
 211     # such objects routinely, so this becomes performance issue
 212     my %input;
 213     while (@args) {
 214         (my $key = shift @args) =~ s/-//g; #deletes all dashes (only dashes)!
 215         $input{$key} = shift @args;
 216     }
 217     $self->{_position}   = 0;
 218     $self->{IC}     = $input{IC};
 219     $self->{e_val}  = $input{e_val};
 220     $self->{width}  = $input{width};
 221         $self->{logA}   = $input{lA};
 222         $self->{logC}   = $input{lC};
 223         $self->{logG}   = $input{lG};
 224         $self->{logT}   = $input{lT};
 225     $self->{sites}  = $input{sites};
 226     $self->{id}     = $input{id} || 'null';
 227     $self->{correction} = $input{correction} || 0;
 228     $self->{accession_number} = $input{accession_number};
 229         return $self unless (defined($input{pA}) && defined($input{pC}) && defined($input{pG}) && defined($input{pT}));
 230
 231     # This should go to _initialize?
 232     # Check for input type- no mixing alllowed, throw ex
 233     if (ref($input{pA}) =~ /ARRAY/i ) {
 234         $self->throw("Mixing matrix data types not allowed: C is not reference") unless(ref($input{pC}));
 235         $self->throw("Mixing matrix data types not allowed: G is not reference") unless (ref($input{pG}));
 236         $self->throw("Mixing matrix data types not allowed: T is not reference") unless (ref($input{pT}));
 237         $self->{probA} = $input{pA};
 238         $self->{probC} = $input{pC};
 239         $self->{probG} = $input{pG};
 240         $self->{probT} = $input{pT};
 241     }
 242     else {
 243         $self->throw("Mixing matrix data types not allowed: C is reference") if (ref($input{pC}));
 244         $self->throw("Mixing matrix data types not allowed: G is reference") if (ref($input{pG}));
 245         $self->throw("Mixing matrix data types not allowed: T is reference") if (ref($input{pT}));
 246         $self->{probA} = [split(//,$input{pA})];
 247         $self->{probC} = [split(//,$input{pC})];
 248         $self->{probG} = [split(//,$input{pG})];
 249         $self->{probT} = [split(//,$input{pT})];
 250         for (my $i=0; $i<= @{$self->{probA}}+1; $i++) {
 251             # we implictely assume these are MEME-style frequencies x 10, so
 252             # 'a' represents the 'maximum', 10. Other positions can actually
 253             # add up to over 10 due to rounding, but I don't think that is a
 254             # problem?
 255             if (${$self->{probA}}[$i] and ${$self->{probA}}[$i] eq 'a') {
 256                 ${$self->{probA}}[$i]='10';
 257             }
 258             if (${$self->{probC}}[$i] and ${$self->{probC}}[$i] eq 'a') {
 259                 ${$self->{probC}}[$i]='10';
 260             }
 261             if (${$self->{probG}}[$i] and ${$self->{probG}}[$i] eq 'a') {
 262                 ${$self->{probG}}[$i]='10';
 263             }
 264             if (${$self->{probT}}[$i] and ${$self->{probT}}[$i] eq 'a') {
 265                 ${$self->{probT}}[$i]='10';
 266             }
 267         }
 268     }
 269
 270     # Check for position with 0 for all bases, throw exception if so
 271     for (my $i=0;$i <= $#{$self->{probA}}; $i++) {
 272         if ((${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i]) == 0) {
 273             $self->throw("Position meaningless-all frequencies are 0");
 274         }
 275
 276         # apply pseudo-count correction to all values - this will result in
 277         # very bad frequencies if the input is already frequences and a
 278         # correction value as large as 1 is used!
 279         if ($self->{correction}) {
 280             ${$self->{probA}}[$i] += $self->{correction};
 281             ${$self->{probC}}[$i] += $self->{correction};
 282             ${$self->{probG}}[$i] += $self->{correction};
 283             ${$self->{probT}}[$i] += $self->{correction};
 284         }
 285
 286         # (re)calculate frequencies
 287         my $div= ${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i];
 288         ${$self->{probA}}[$i]=${$self->{probA}}[$i]/$div;
 289         ${$self->{probC}}[$i]=${$self->{probC}}[$i]/$div;
 290         ${$self->{probG}}[$i]=${$self->{probG}}[$i]/$div;
 291         ${$self->{probT}}[$i]=${$self->{probT}}[$i]/$div;
 292     }
 293
 294     # Calculate the logs
 295     if ((!defined($self->{logA})) && ($input{model})) {
 296         $self->calc_weight($input{model});
 297     }
 298
 299     # Make consensus, throw if any one of the vectors is shorter
 300     $self->_calculate_consensus;
 301     return $self;
 302 }
 303
 304 =head2 _calculate_consensus
 305
 306  Title   : _calculate_consensus
 307  Function: Internal stuff
 308
 309 =cut
 310
 311 sub _calculate_consensus {
 312     my $self=shift;
 313     my ($lc,$lt,$lg)=($#{$self->{probC}},$#{$self->{probT}},$#{$self->{probG}});
 314     my $len=$#{$self->{probA}};
 315     $self->throw("Probability matrix is damaged for C: $len vs $lc") if ($len != $lc);
 316     $self->throw("Probability matrix is damaged for T: $len vs $lt") if ($len != $lt);
 317     $self->throw("Probability matrix is damaged for G: $len vs $lg") if ($len != $lg);
 318     for (my $i=0; $i<$len+1; $i++) {
 319         #*** IUPACp values not actually used (eg. by next_pos)
 320         (${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]);
 321         (${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]);
 322     }
 323     return $self;
 324 }
 325
 326 =head2 calc_weight
 327
 328  Title   : calc_weight
 329  Usage   : $obj->calc_weight({A=>0.2562, C=>0.2438, G=>0.2432, T=>0.2568});
 330  Function: Recalculates the PSM (or weights) based on the PFM (the frequency
 331            matrix) and user supplied background model.
 332  Throws  : if no model is supplied
 333  Returns : n/a
 334  Args    : reference to a hash with background frequencies for A,C,G and T
 335
 336 =cut
 337
 338 sub calc_weight {
 339     my ($self, $model) = @_;
 340     my %model;
 341     $model{probA}=$model->{A};
 342     $model{probC}=$model->{C};
 343     $model{probG}=$model->{G};
 344     $model{probT}=$model->{T};
 345     foreach my $let (qw(probA probC probG probT)) {
 346       my @str;
 347       $self->throw('You did not provide valid model\n') unless (($model{$let}>0) && ($model{$let}<1));
 348       foreach my $f (@{$self->{$let}}) {
 349         my $w=log($f)-log($model{$let});
 350         push @str,$w;
 351       }
 352       my $llet=$let;
 353       $llet=~s/prob/log/;
 354       $self->{$llet}=\@str;
 355     }
 356     return $self;
 357 }
 358
 359 =head2 next_pos
 360
 361  Title   : next_pos
 362  Usage   :
 363  Function: Retrieves the next position features: frequencies for A,C,G,T, the
 364            main letter (as in consensus) and the probabilty for this letter to
 365            occur at this position and the current position
 366  Returns : hash (pA,pC,pG,pT,logA,logC,logG,logT,base,prob,rel)
 367  Args    : none
 368
 369 =cut
 370
 371 sub next_pos {
 372     my $self = shift;
 373     $self->throw("instance method called on class") unless ref $self;
 374     my $len=@{$self->{seq}};
 375     my $pos=$self->{_position};
 376     # End reached?
 377     if ($pos<$len) {
 378         my $pA=${$self->{probA}}[$pos];
 379         my $pC=${$self->{probC}}[$pos];
 380         my $pG=${$self->{probG}}[$pos];
 381         my $pT=${$self->{probT}}[$pos];
 382         my $lA=${$self->{logA}}[$pos];
 383         my $lC=${$self->{logC}}[$pos];
 384         my $lG=${$self->{logG}}[$pos];
 385         my $lT=${$self->{logT}}[$pos];
 386         my $base=${$self->{seq}}[$pos];
 387         my $prob=${$self->{seqp}}[$pos];
 388         $self->{_position}++;
 389         my %seq=(pA=>$pA,pT=>$pT,pC=>$pC,pG=>$pG, lA=>$lA,lT=>$lT,lC=>$lC,lG=>$lG,base=>$base,rel=>$pos, prob=>$prob);
 390         return %seq;
 391     }
 392     else {$self->{_position}=0; return;}
 393 }
 394
 395 =head2 curpos
 396
 397  Title   : curpos
 398  Usage   :
 399  Function: Gets/sets the current position. Converts to 0 if argument is minus
 400            and to width if greater than width
 401  Returns : integer
 402  Args    : integer
 403
 404 =cut
 405
 406 sub curpos {
 407     my $self = shift;
 408     my $prev = $self->{_position};
 409     if (@_) { $self->{_position} = shift; }
 410     return $prev;
 411 }
 412
 413 =head2 e_val
 414
 415  Title   : e_val
 416  Usage   :
 417  Function: Gets/sets the e-value
 418  Returns : real number
 419  Args    : none to get, real number to set
 420
 421 =cut
 422
 423 sub e_val {
 424     my $self = shift;
 425     my $prev = $self->{e_val};
 426     if (@_) { $self->{e_val} = shift; }
 427     return $prev;
 428 }
 429
 430 =head2 IC
 431
 432  Title   : IC
 433  Usage   :
 434  Function: Get/set the Information Content
 435  Returns : real number
 436  Args    : none to get, real number to set
 437
 438 =cut
 439
 440 sub IC {
 441     my $self = shift;
 442     my $prev = $self->{IC};
 443     if (@_) { $self->{IC} = shift; }
 444     return $prev;
 445 }
 446
 447 =head2 accession_number
 448
 449  Title   : accession_number
 450  Function: Get/set the accession number, this will be unique id for the
 451            SiteMatrix object as well for any other object, inheriting from
 452            SiteMatrix
 453  Returns : string
 454  Args    : none to get, string to set
 455
 456 =cut
 457
 458 sub accession_number {
 459     my $self = shift;
 460     my $prev = $self->{accession_number};
 461     if (@_) { $self->{accession_number} = shift; }
 462     return $prev;
 463 }
 464
 465 =head2 consensus
 466
 467  Title   : consensus
 468  Usage   :
 469  Function: Returns the consensus
 470  Returns : string
 471  Args    : (optional) threshold value 1 to 10, default 5
 472            '5' means the returned characters had a 50% or higher presence at
 473            their position
 474
 475 =cut
 476
 477 sub consensus {
 478     my ($self, $thresh) = @_;
 479     if ($thresh) {
 480         my $len=$#{$self->{probA}};
 481         for (my $i=0; $i<$len+1; $i++) {
 482             (${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh);
 483         }
 484     }
 485     my $consensus='';
 486     foreach my $letter (@{$self->{seq}}) {
 487         $consensus .= $letter;
 488     }
 489     return $consensus;
 490 }
 491
 492 =head2 width
 493
 494  Title   : width
 495  Usage   :
 496  Function: Returns the length of the sites in used to make this matrix
 497  Returns : int
 498  Args    : none
 499
 500 =cut
 501
 502 sub width {
 503     my $self = shift;
 504     my $width=@{$self->{probA}};
 505     return $width;
 506 }
 507
 508 =head2 sites
 509
 510  Title   : sites
 511  Usage   :
 512  Function: Get/set the number of sites that were used to make this matrix
 513  Returns : int
 514  Args    : none to get, int to set
 515
 516 =cut
 517
 518 sub sites {
 519     my $self = shift;
 520     if (@_) { $self->{sites} = shift }
 521     return $self->{sites} || return;
 522 }
 523
 524 =head2 IUPAC
 525
 526  Title   : IUPAC
 527  Usage   :
 528  Function: Returns IUPAC compliant consensus
 529  Returns : string
 530  Args    : optionally, also supply a whole number (int) of 1 or higher to set
 531            the significance level when considering the frequencies. 1 (the
 532            default) means a 0.05 significance level: frequencies lower than
 533            0.05 will be ignored. 2 Means a 0.005 level, and so on.
 534
 535 =cut
 536
 537 sub IUPAC {
 538         my ($self, $thresh) = @_;
 539     if ($thresh) {
 540         my $len=$#{$self->{probA}};
 541         for (my $i=0; $i<$len+1; $i++) {
 542             (${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh);
 543         }
 544     }
 545         my $iu=$self->{IUPAC};
 546         my $iupac='';
 547         foreach my $let (@{$iu}) {
 548                 $iupac .= $let;
 549         }
 550     return $iupac;
 551 }
 552
 553 =head2 _to_IUPAC
 554
 555  Title   : _to_IUPAC
 556  Usage   :
 557  Function: Converts a single position to IUPAC compliant symbol.
 558            For rules see the implementation
 559  Returns : char, real number
 560  Args    : real numbers for frequencies of A,C,G,T (positional)
 561
 562            optionally, also supply a whole number (int) of 1 or higher to set
 563            the significance level when considering the frequencies. 1 (the
 564            default) means a 0.05 significance level: frequencies lower than
 565            0.05 will be ignored. 2 Means a 0.005 level, and so on.
 566
 567 =cut
 568
 569 sub _to_IUPAC {
 570         my ($a, $c, $g, $t, $thresh) = @_;
 571     $thresh ||= 1;
 572     $thresh = int($thresh);
 573     $a = sprintf ("%.${thresh}f", $a);
 574     $c = sprintf ("%.${thresh}f", $c);
 575     $g = sprintf ("%.${thresh}f", $g);
 576     $t = sprintf ("%.${thresh}f", $t);
 577
 578     my $total = $a + $c + $g + $t;
 579
 580         return 'A' if ($a == $total);
 581         return 'G' if ($g == $total);
 582         return 'C' if ($c == $total);
 583         return 'T' if ($t == $total);
 584         my $r=$g+$a;
 585         return 'R' if ($r == $total);
 586         my $y=$t+$c;
 587         return 'Y' if ($y == $total);
 588         my $m=$a+$c;
 589         return 'M' if ($m == $total);
 590         my $k=$g+$t;
 591         return 'K' if ($k == $total);
 592         my $s=$g+$c;
 593         return 'S' if ($s == $total);
 594         my $w=$a+$t;
 595         return 'W' if ($w == $total);
 596         my $d=$r+$t;
 597         return 'D' if ($d == $total);
 598         my $v=$r+$c;
 599         return 'V' if ($v == $total);
 600         my $b=$y+$g;
 601         return 'B' if ($b == $total);
 602         my $h=$y+$a;
 603         return 'H' if ($h == $total);
 604         return 'N';
 605 }
 606
 607 =head2 _to_cons
 608
 609  Title   : _to_cons
 610  Usage   :
 611  Function: Converts a single position to simple consensus character and returns
 612            its probability. For rules see the implementation
 613  Returns : char, real number
 614  Args    : real numbers for A,C,G,T (positional), and optional 5th argument of
 615            threshold (as a number between 1 and 10, where 5 is default and
 616            means the returned character had a 50% or higher presence at this
 617            position)
 618
 619 =cut
 620
 621 sub _to_cons {
 622         my ($A, $C, $G, $T, $thresh) = @_;
 623     $thresh ||= 5;
 624
 625     # this multiplication by 10 is just to satisfy the thresh range of 1-10
 626         my $a = $A * 10;
 627         my $c = $C * 10;
 628         my $g = $G * 10;
 629         my $t = $T * 10;
 630
 631     return 'N',10 if (($a<$thresh) && ($c<$thresh) && ($g<$thresh) && ($t<$thresh));
 632         return 'N',10 if (($a==$t) && ($a==$c) && ($a==$g));
 633
 634     # threshold could be lower than 50%, so must check is not only over
 635     # threshold, but also the highest frequency
 636         return 'A',$a if (($a>=$thresh) && ($a>$t) && ($a>$c) && ($a>$g));
 637         return 'C',$c if (($c>=$thresh) && ($c>$t) && ($c>$a) && ($c>$g));
 638         return 'G',$g if (($g>=$thresh) && ($g>$t) && ($g>$c) && ($g>$a));
 639         return 'T',$t if (($t>=$thresh) && ($t>$g) && ($t>$c) && ($t>$a));
 640
 641     return 'N',10;
 642 }
 643
 644 =head2 get_string
 645
 646  Title   : get_string
 647  Usage   :
 648  Function: Returns given probability vector as a string. Useful if you want to
 649            store things in a rel database, where arrays are not first choice
 650  Throws  : If the argument is outside {A,C,G,T}
 651  Returns : string
 652  Args    : character {A,C,G,T}
 653
 654 =cut
 655
 656 sub get_string {
 657         my $self=shift;
 658         my $base=shift;
 659         my $string='';
 660         my @prob;
 661
 662         BASE: {
 663                 if ($base eq 'A') {@prob= @{$self->{probA}}; last BASE; }
 664                 if ($base eq 'C') {@prob= @{$self->{probC}}; last BASE; }
 665                 if ($base eq 'G') {@prob= @{$self->{probG}}; last BASE; }
 666                 if ($base eq 'T') {@prob= @{$self->{probT}}; last BASE; }
 667                 $self->throw ("No such base: $base!\n");
 668         }
 669
 670     foreach  my $prob (@prob) {
 671         my $corrected = $prob*10;
 672         my $next=sprintf("%.0f",$corrected);
 673         $next='a' if ($next eq '10');
 674         $string .= $next;
 675     }
 676     return $string;
 677 }
 678
 679 =head2 get_array
 680
 681  Title   : get_array
 682  Usage   :
 683  Function: Returns an array with frequencies for a specified base
 684  Returns : array
 685  Args    : char
 686
 687 =cut
 688
 689 sub get_array {
 690         my $self=shift;
 691         my $base=uc(shift);
 692         return  @{$self->{probA}} if ($base eq 'A');
 693         return  @{$self->{probC}} if ($base eq 'C');
 694         return  @{$self->{probG}} if ($base eq 'G');
 695         return  @{$self->{probT}} if ($base eq 'T');
 696         $self->throw("No such base: $base!\n");
 697 }
 698
 699 =head2 get_logs_array
 700
 701  Title   : get_logs_array
 702  Usage   :
 703  Function: Returns an array with log_odds for a specified base
 704  Returns : array
 705  Args    : char
 706
 707 =cut
 708
 709 sub get_logs_array {
 710         my $self=shift;
 711         my $base=uc(shift);
 712         return  @{$self->{logA}} if (($base eq 'A')  && ($self->{logA}));
 713         return  @{$self->{logC}} if (($base eq 'C')  && ($self->{logC}));
 714         return  @{$self->{logG}} if (($base eq 'G')  && ($self->{logG}));
 715         return  @{$self->{logT}} if (($base eq 'T')  && ($self->{logT}));
 716         $self->throw ("No such base: $base!\n") if (!grep(/$base/,qw(A C G T)));
 717     return;
 718 }
 719
 720 =head2 id
 721
 722  Title   : id
 723  Usage   :
 724  Function: Gets/sets the site id
 725  Returns : string
 726  Args    : string
 727
 728 =cut
 729
 730 sub id {
 731     my $self = shift;
 732     my $prev = $self->{id};
 733     if (@_) { $self->{id} = shift; }
 734     return $prev;
 735 }
 736
 737 =head2 regexp
 738
 739  Title   : regexp
 740  Usage   :
 741  Function: Returns a regular expression which matches the IUPAC convention.
 742            N will match X, N, - and .
 743  Returns : string
 744  Args    : none (works at the threshold last used for making the IUPAC string)
 745
 746 =cut
 747
 748 sub regexp {
 749         my $self=shift;
 750         my $regexp;
 751         foreach my $letter (@{$self->{IUPAC}}) {
 752                 my $reg;
 753                 LETTER: {
 754                         if ($letter eq 'A') { $reg='[Aa]'; last LETTER; }
 755                         if ($letter eq 'C') { $reg='[Cc]'; last LETTER; }
 756                         if ($letter eq 'G') { $reg='[Gg]'; last LETTER; }
 757                         if ($letter eq 'T') { $reg='[Tt]'; last LETTER; }
 758                         if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; }
 759                         if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; }
 760                         if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; }
 761                         if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; }
 762                         if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; }
 763                         if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; }
 764                         if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; }
 765                         if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; }
 766                         if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; }
 767                         if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; }
 768                         $reg='\S';
 769                 }
 770                 $regexp .= $reg;
 771         }
 772     return $regexp;
 773 }
 774
 775 =head2 regexp_array
 776
 777  Title   : regexp_array
 778  Usage   :
 779  Function: Returns a regular expression which matches the IUPAC convention.
 780            N will match X, N, - and .
 781  Returns : array
 782  Args    : none (works at the threshold last used for making the IUPAC string)
 783  To do   : I have separated regexp and regexp_array, but
 784            maybe they can be rewritten as one - just check what should be returned
 785
 786 =cut
 787
 788 sub regexp_array {
 789         my $self=shift;
 790         my @regexp;
 791         foreach my $letter (@{$self->{IUPAC}}) {
 792                 my $reg;
 793                 LETTER: {
 794                         if ($letter eq 'A') { $reg='[Aa]'; last LETTER; }
 795                         if ($letter eq 'C') { $reg='[Cc]'; last LETTER; }
 796                         if ($letter eq 'G') { $reg='[Gg]'; last LETTER; }
 797                         if ($letter eq 'T') { $reg='[Tt]'; last LETTER; }
 798                         if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; }
 799                         if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; }
 800                         if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; }
 801                         if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; }
 802                         if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; }
 803                         if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; }
 804                         if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; }
 805                         if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; }
 806                         if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; }
 807                         if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; }
 808                         $reg='\S';
 809                 }
 810                 push @regexp,$reg;
 811         }
 812     return @regexp;
 813 }
 814
 815
 816 =head2 _compress_array
 817
 818  Title   : _compress_array
 819  Usage   :
 820  Function: Will compress an array of real signed numbers to a string (ie vector
 821            of bytes) -127 to +127 for bi-directional(signed) and 0..255 for
 822            unsigned
 823  Returns : String
 824  Args    : array reference, followed by an max value and direction (optional,
 825            default 1-unsigned),1 unsigned, any other is signed.
 826
 827 =cut
 828
 829 sub _compress_array {
 830         my ($array,$lm,$direct)=@_;
 831         my $str;
 832         return  unless(($array) && ($lm));
 833         $direct=1 unless ($direct);
 834         my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
 835         foreach my $c (@{$array}) {
 836                 $c=$lm if ($c>$lm);
 837                 $c=-$lm if (($c<-$lm) && ($direct !=1));
 838     $c=0 if (($c<0) && ($direct ==1));
 839                 my $byte=int($k1*$c);
 840     $byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits
 841     my $char=chr($byte);
 842                 $str.=$char;
 843         }
 844         return $str;
 845 }
 846
 847 =head2 _uncompress_string
 848
 849  Title   : _uncompress_string
 850  Usage   :
 851  Function: Will uncompress a string (vector of bytes) to create an array of
 852            real signed numbers (opposite to_compress_array)
 853  Returns : string, followed by an max value and
 854                    direction (optional, default 1-unsigned), 1 unsigned, any other is signed.
 855  Args    : array
 856
 857 =cut
 858
 859 sub _uncompress_string {
 860         my ($str,$lm,$direct)=@_;
 861         my @array;
 862         return unless(($str) && ($lm));
 863         $direct=1 unless ($direct);
 864         my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
 865         foreach my $c (split(//,$str)) {
 866                 my $byte=ord($c);
 867                 $byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits
 868                 my $num=$byte/$k1;
 869                 push @array,$num;
 870         }
 871         return @array;
 872 }
 873
 874 =head2 get_compressed_freq
 875
 876  Title   : get_compressed_freq
 877  Usage   :
 878  Function: A method to provide a compressed frequency vector. It uses one byte
 879            to code the frequence for one of the probability vectors for one
 880            position. Useful for relational database. Improvement of the previous
 881            0..a coding.
 882  Example :  my $strA=$self->get_compressed_freq('A');
 883  Returns :  String
 884  Args    :  char
 885
 886 =cut
 887
 888 sub get_compressed_freq {
 889         my $self=shift;
 890         my $base=shift;
 891         my $string='';
 892         my @prob;
 893         BASE: {
 894                 if ($base eq 'A') {
 895       @prob= @{$self->{probA}} unless (!defined($self->{probA}));
 896       last BASE;
 897     }
 898                 if ($base eq 'G') {
 899       @prob= @{$self->{probG}} unless (!defined($self->{probG}));
 900       last BASE;
 901     }
 902                 if ($base eq 'C') {
 903       @prob= @{$self->{probC}} unless (!defined($self->{probC}));
 904       last BASE;
 905     }
 906                 if ($base eq 'T') {
 907       @prob= @{$self->{probT}} unless (!defined($self->{probT}));
 908       last BASE;
 909     }
 910                 $self->throw ("No such base: $base!\n");
 911         }
 912         my $str= _compress_array(\@prob,1,1);
 913     return $str;
 914 }
 915
 916 =head2 get_compressed_logs
 917
 918  Title   : get_compressed_logs
 919  Usage   :
 920  Function: A method to provide a compressed log-odd vector. It uses one byte to
 921                    code the log value for one of the log-odds vectors for one position.
 922  Example : my $strA=$self->get_compressed_logs('A');
 923  Returns : String
 924  Args    : char
 925
 926 =cut
 927
 928 sub get_compressed_logs {
 929         my $self=shift;
 930         my $base=shift;
 931         my $string='';
 932         my @prob;
 933         BASE: {
 934                 if ($base eq 'A') {@prob= @{$self->{logA}} unless (!defined($self->{logA})); last BASE; }
 935                 if ($base eq 'C') {@prob= @{$self->{logC}} unless (!defined($self->{logC})); last BASE; }
 936                 if ($base eq 'G') {@prob= @{$self->{logG}} unless (!defined($self->{logG})); last BASE; }
 937                 if ($base eq 'T') {@prob= @{$self->{logT}} unless (!defined($self->{logT})); last BASE; }
 938                 $self->throw ("No such base: $base!\n");
 939         }
 940         return _compress_array(\@prob,1000,2);
 941 }
 942
 943 =head2 sequence_match_weight
 944
 945  Title   : sequence_match_weight
 946  Usage   :
 947  Function: This method will calculate the score of a match, based on the PWM
 948            if such is associated with the matrix object. Returns undef if no
 949            PWM data is available.
 950  Throws  : if the length of the sequence is different from the matrix width
 951  Example : my $score=$matrix->sequence_match_weight('ACGGATAG');
 952  Returns : Floating point
 953  Args    : string
 954
 955 =cut
 956
 957 sub sequence_match_weight {
 958     my ($self,$seq)=@_;
 959     return unless ($self->{logA});
 960     my $width=$self->width;
 961     $self->throw ("I can calculate the score only for sequence which are exactly my size for $seq, my width is $width\n") unless (length($seq)==@{$self->{logA}});
 962     $seq = uc($seq);
 963     my @seq=split(//,$seq);
 964     my $score = 0;
 965     my $i=0;
 966     foreach my $pos (@seq) {
 967         my $tv = 'log'.$pos;
 968         $self->warn("Position ".($i+1)." of input sequence has unknown (ambiguity?) character '$pos': scores will be wrong") unless defined $self->{$tv};
 969         $score += defined $self->{$tv} ? $self->{$tv}->[$i] : 0;
 970         $i++;
 971     }
 972     return $score;
 973 }
 974
 975 =head2 get_all_vectors
 976
 977  Title   : get_all_vectors
 978  Usage   :
 979  Function: returns all possible sequence vectors to satisfy the PFM under
 980            a given threshold
 981  Throws  : If threshold outside of 0..1 (no sense to do that)
 982  Example : my @vectors=$self->get_all_vectors(4);
 983  Returns : Array of strings
 984  Args    : (optional) floating
 985
 986 =cut
 987
 988 sub get_all_vectors {
 989         my $self=shift;
 990         my $thresh=shift;
 991     $self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1));
 992     my @seq=split(//,$self->consensus($thresh*10));
 993     my @perm;
 994     for my $i (0..@{$self->{probA}}) {
 995         push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh);
 996         push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh);
 997         push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh);
 998         push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh);
 999         push @{$perm[$i]},'N' if  ($seq[$i] eq 'N');
1000     }
1001     my $fpos=shift @perm;
1002     my @strings=@$fpos;
1003     foreach my $pos (@perm) {
1004         my @newstr;
1005         foreach my $let (@$pos) {
1006             foreach my $string (@strings) {
1007                 my $newstring = $string . $let;
1008                 push @newstr,$newstring;
1009             }
1010         }
1011         @strings=@newstr;
1012     }
1013         return @strings;
1014 }
1015
1016 1;