lib/CXGN/Tools/AnnotationConsensus.pm

   1 package CXGN::Tools::AnnotationConsensus;
   2 =head1 CXGN::Tools::AnnotationConsensus.pm
   3
   4  Given several annotations, this module implements a phrase-scoring
   5  algorithm of my own design to find the "consensus" annotation: the
   6  one which scores the highest based on similar phrases to other
   7  annotations.
   8
   9 =head1 AUTHOR
  10
  11  C. Carpita <ccarpita@gmail.com>
  12
  13 =head1 Methods
  14
  15 =cut
  16
  17 #Clip all annotations to this length, for the sake of efficiency
  18 our $CUTOFF = 300;
  19
  20 =head2 new()
  21
  22  Syn: Create a new consensus-calculator
  23  Args: (Opt) An anonymous hash of $id=>$annotation
  24  Ret: Instance of factory
  25
  26 =cut
  27
  28 sub new {
  29         my $class = shift;
  30         my $self = bless {}, $class;
  31         my $annotations = shift;
  32         $self->{annotations} = $annotations if(ref $annotations); # $id => $annotation
  33         $self->{scores} = (); # $id => $score
  34
  35         $self->{tuples} = ();   # $id => $tuples (combinations of phrases)
  36
  37         $self->{max_tuple_score} = 0;
  38         $self->{max_tuple} = "";
  39
  40         return $self;
  41 }
  42
  43 =head2 addAnnotation()
  44
  45  Syn: Add an annotation to the hash to be processed
  46  Args: Id, Annotation (content)
  47  Returns: Nothing
  48
  49 =cut
  50
  51 sub addAnnotation {
  52         my $self = shift;
  53         my ($id, $annotation) = @_;
  54         die "Need ID and annotation" unless($id && $annotation);
  55         $self->{annotations}->{$id} = $annotation;
  56 }
  57
  58 =head2 calculate()
  59
  60  Syn: Runs the levenshtein algorithm on each possible pair, sums the total
  61       edit score for each individual annotation.  Lower # of edits = better,
  62           like golf, or 301
  63  Ret/Args: None
  64
  65 =cut
  66
  67 sub calculate {
  68         my $self = shift;
  69         my %scores = ();
  70         print STDERR "...Calculating\n";
  71
  72         #Grab all the tuples (combinations of phrases) for each annotation
  73         while(my ($id, $annot) = each %{$self->{annotations}}){
  74                 my @tuples = $self->_get_tuples($annot);
  75                 my %tuples = ();
  76                 $tuples{$_} = 1 foreach(@tuples);
  77                 $self->{tuples}->{$id} = \%tuples;
  78         }
  79
  80         my %tup_copy = ();
  81         %tup_copy = %{$self->{tuples}};
  82         while(my ($id, $tup_hash) = each %{$self->{tuples}}){
  83                 print STDERR "#";
  84                 my $qscore = 0;
  85                 while(my ($q_id, $q_tup_hash) = each %tup_copy){
  86                         next if ($q_id eq $id);
  87                                 $qscore += $self->_tuple_score($tup_hash, $q_tup_hash);
  88                 }
  89                 $scores{$id} = $qscore;
  90         }
  91         $self->{scores} = \%scores;
  92         print STDERR "\n";
  93 }
  94
  95 =head2 getConsensus()
  96
  97  Syn: Get the consensus annotation
  98  Ret: An array of ($id, $annotation, $score)
  99  Side: Runs $self->calculate() if no values exist in score hash
 100
 101 =cut
 102
 103 sub getConsensus {
 104         my $self = shift;
 105         $self->calculate() unless (values(%{$self->{scores}}));
 106         my @ids = keys %{$self->{scores}};
 107         my @sample = keys %{$self->{scores}};
 108         my $s_id = pop @sample;
 109         my $max = $self->{scores}->{$s_id};
 110         my $max_id = $s_id;
 111         while (my ($id, $score) = each %{$self->{scores}}){
 112                 if ($score > $max) {
 113                         $max_id = $id;
 114                         $max = $score;
 115                 }
 116         }
 117         return ($max_id, $self->{annotations}->{$max_id}, $max);
 118 }
 119
 120 =head2 getScores()
 121
 122  Syn: Gets a hash of $id => $score
 123  Side: Calculates if no values in score hash
 124
 125 =cut
 126
 127 sub getScores {
 128         my $self = shift;
 129         $self->calculate() unless (values(%{$self->{scores}}));
 130         return $self->{scores};
 131 }
 132
 133 sub getMaxTuple {
 134         my $self = shift;
 135         return $self->{max_tuple};
 136 }
 137
 138 #Replaced levenshtein with my own algorithm, should be much faster
 139 sub _get_tuples {
 140         my $self = shift;
 141         my $string = shift;
 142         $string =~ s/\(.*?\)//g;
 143         $string =~ s/\[.*?\]//g;
 144         $string =~ s/;|,//g;
 145         $string = lc($string);  #Case-insensitive tuple-comparison
 146         my @words = split /\s+/, $string;
 147
 148         my @tuples = ();
 149         push(@tuples, @words);
 150
 151         my $t = 2;
 152         while($t <= @words){
 153                 my $i = 0;
 154                 while($i < (@words - $t + 1)){
 155                         my @tuple = ();
 156                         for (my $k = $i; $k < $i + $t; $k++){
 157                                 push(@tuple, $words[$k]);
 158                         }
 159                         push(@tuples, join " ", @tuple);
 160                         $i++;
 161                 }
 162                 $t++;
 163         }
 164         return @tuples;
 165 }
 166
 167 sub _tuple_score {
 168         my $self = shift;
 169         my ($base_tup, $q_tup) = @_;
 170         die "Arguments should be hash refs" unless (ref $base_tup && ref $q_tup);
 171         my $score = 0;
 172         my %base_tup = %$base_tup;
 173         my %query_tup = %$q_tup;
 174         while(my ($k, $v) = each %base_tup){
 175                 next unless $v;
 176                 my $num_spaces = 0;
 177                 $num_spaces++ while($k =~ / /g);
 178                 my $multiplier = ($num_spaces+1)**2;
 179                 if($query_tup{$k}){
 180                         $score += $multiplier;
 181                         if($multiplier > $self->{max_tuple_score}){
 182                                 $self->{max_tuple} = $k;
 183                         }
 184                 }
 185         }
 186         return $score;
 187 }
 188
 189
 190
 191 #Not using this anymore, way too slow:
 192 sub _levenshtein {
 193         my $self = shift;
 194         my ($s1, $s2) = @_;
 195         my $m = length $s1;
 196         my $n = length $s2;
 197
 198         my %matrix = ();
 199         my $matrix = \%matrix;
 200
 201         my ($i, $j);
 202         for($i = 0; $i<=$m; $i++){
 203                 $matrix->{$i}->{0} = $i;
 204         }
 205         for($j = 0; $j<=$n; $j++){
 206                 $matrix->{0}->{$j} = $j;
 207         }
 208
 209         for(my $i = 1; $i<=$m; $i++){
 210                 for(my $j = 1; $j<=$n; $j++){
 211                         my $cost;
 212                         if (lc(substr($s1, $i-1, 1)) eq lc(substr($s2, $j-1, 1))){
 213                                 $cost = 0;
 214                         }
 215                         else {
 216                                 $cost = 1;
 217                         }
 218                         $matrix->{$i}->{$j} = __minimum (
 219                                 $matrix->{$i-1}->{$j} + 1,
 220                                 $matrix->{$i}->{$j-1} + 1,
 221                                 $matrix->{$i-1}->{$j-1} + $cost
 222                         );
 223                 }
 224         }
 225         return $matrix->{$m}->{$n};
 226 }
 227
 228 sub __minimum {
 229         my $min = shift;
 230         foreach(@_){
 231                 $min = $_ if ($_ < $min);
 232         }
 233         return $min;
 234 }
 235
 236
 237
 238 1;