1 package CXGN
::Tools
::AnnotationConsensus
;
2 =head1 CXGN::Tools::AnnotationConsensus.pm
4 Given several annotations, this module implements a phrase-scoring
5 algorithm of my own design to find the "consensus" annotation: the
6 one which scores the highest based on similar phrases to other
11 C. Carpita <ccarpita@gmail.com>
17 #Clip all annotations to this length, for the sake of efficiency
22 Syn: Create a new consensus-calculator
23 Args: (Opt) An anonymous hash of $id=>$annotation
24 Ret: Instance of factory
30 my $self = bless {}, $class;
31 my $annotations = shift;
32 $self->{annotations
} = $annotations if(ref $annotations); # $id => $annotation
33 $self->{scores
} = (); # $id => $score
35 $self->{tuples
} = (); # $id => $tuples (combinations of phrases)
37 $self->{max_tuple_score
} = 0;
38 $self->{max_tuple
} = "";
43 =head2 addAnnotation()
45 Syn: Add an annotation to the hash to be processed
46 Args: Id, Annotation (content)
53 my ($id, $annotation) = @_;
54 die "Need ID and annotation" unless($id && $annotation);
55 $self->{annotations
}->{$id} = $annotation;
60 Syn: Runs the levenshtein algorithm on each possible pair, sums the total
61 edit score for each individual annotation. Lower # of edits = better,
70 print STDERR
"...Calculating\n";
72 #Grab all the tuples (combinations of phrases) for each annotation
73 while(my ($id, $annot) = each %{$self->{annotations
}}){
74 my @tuples = $self->_get_tuples($annot);
76 $tuples{$_} = 1 foreach(@tuples);
77 $self->{tuples
}->{$id} = \
%tuples;
81 %tup_copy = %{$self->{tuples
}};
82 while(my ($id, $tup_hash) = each %{$self->{tuples
}}){
85 while(my ($q_id, $q_tup_hash) = each %tup_copy){
86 next if ($q_id eq $id);
87 $qscore += $self->_tuple_score($tup_hash, $q_tup_hash);
89 $scores{$id} = $qscore;
91 $self->{scores
} = \
%scores;
97 Syn: Get the consensus annotation
98 Ret: An array of ($id, $annotation, $score)
99 Side: Runs $self->calculate() if no values exist in score hash
105 $self->calculate() unless (values(%{$self->{scores
}}));
106 my @ids = keys %{$self->{scores
}};
107 my @sample = keys %{$self->{scores
}};
108 my $s_id = pop @sample;
109 my $max = $self->{scores
}->{$s_id};
111 while (my ($id, $score) = each %{$self->{scores
}}){
117 return ($max_id, $self->{annotations
}->{$max_id}, $max);
122 Syn: Gets a hash of $id => $score
123 Side: Calculates if no values in score hash
129 $self->calculate() unless (values(%{$self->{scores
}}));
130 return $self->{scores
};
135 return $self->{max_tuple
};
138 #Replaced levenshtein with my own algorithm, should be much faster
142 $string =~ s/\(.*?\)//g;
143 $string =~ s/\[.*?\]//g;
145 $string = lc($string); #Case-insensitive tuple-comparison
146 my @words = split /\s+/, $string;
149 push(@tuples, @words);
154 while($i < (@words - $t + 1)){
156 for (my $k = $i; $k < $i + $t; $k++){
157 push(@tuple, $words[$k]);
159 push(@tuples, join " ", @tuple);
169 my ($base_tup, $q_tup) = @_;
170 die "Arguments should be hash refs" unless (ref $base_tup && ref $q_tup);
172 my %base_tup = %$base_tup;
173 my %query_tup = %$q_tup;
174 while(my ($k, $v) = each %base_tup){
177 $num_spaces++ while($k =~ / /g);
178 my $multiplier = ($num_spaces+1)**2;
180 $score += $multiplier;
181 if($multiplier > $self->{max_tuple_score
}){
182 $self->{max_tuple
} = $k;
191 #Not using this anymore, way too slow:
199 my $matrix = \
%matrix;
202 for($i = 0; $i<=$m; $i++){
203 $matrix->{$i}->{0} = $i;
205 for($j = 0; $j<=$n; $j++){
206 $matrix->{0}->{$j} = $j;
209 for(my $i = 1; $i<=$m; $i++){
210 for(my $j = 1; $j<=$n; $j++){
212 if (lc(substr($s1, $i-1, 1)) eq lc(substr($s2, $j-1, 1))){
218 $matrix->{$i}->{$j} = __minimum
(
219 $matrix->{$i-1}->{$j} + 1,
220 $matrix->{$i}->{$j-1} + 1,
221 $matrix->{$i-1}->{$j-1} + $cost
225 return $matrix->{$m}->{$n};
231 $min = $_ if ($_ < $min);