a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / CXGN / Tools / AnnotationConsensus.pm
blobadd3fa4e9611e5b80b8c10913b4cdb53b4feb403
1 package CXGN::Tools::AnnotationConsensus;
2 =head1 CXGN::Tools::AnnotationConsensus.pm
4 Given several annotations, this module implements a phrase-scoring
5 algorithm of my own design to find the "consensus" annotation: the
6 one which scores the highest based on similar phrases to other
7 annotations.
9 =head1 AUTHOR
11 C. Carpita <ccarpita@gmail.com>
13 =head1 Methods
15 =cut
17 #Clip all annotations to this length, for the sake of efficiency
18 our $CUTOFF = 300;
20 =head2 new()
22 Syn: Create a new consensus-calculator
23 Args: (Opt) An anonymous hash of $id=>$annotation
24 Ret: Instance of factory
26 =cut
28 sub new {
29 my $class = shift;
30 my $self = bless {}, $class;
31 my $annotations = shift;
32 $self->{annotations} = $annotations if(ref $annotations); # $id => $annotation
33 $self->{scores} = (); # $id => $score
35 $self->{tuples} = (); # $id => $tuples (combinations of phrases)
37 $self->{max_tuple_score} = 0;
38 $self->{max_tuple} = "";
40 return $self;
43 =head2 addAnnotation()
45 Syn: Add an annotation to the hash to be processed
46 Args: Id, Annotation (content)
47 Returns: Nothing
49 =cut
51 sub addAnnotation {
52 my $self = shift;
53 my ($id, $annotation) = @_;
54 die "Need ID and annotation" unless($id && $annotation);
55 $self->{annotations}->{$id} = $annotation;
58 =head2 calculate()
60 Syn: Runs the levenshtein algorithm on each possible pair, sums the total
61 edit score for each individual annotation. Lower # of edits = better,
62 like golf, or 301
63 Ret/Args: None
65 =cut
67 sub calculate {
68 my $self = shift;
69 my %scores = ();
70 print STDERR "...Calculating\n";
72 #Grab all the tuples (combinations of phrases) for each annotation
73 while(my ($id, $annot) = each %{$self->{annotations}}){
74 my @tuples = $self->_get_tuples($annot);
75 my %tuples = ();
76 $tuples{$_} = 1 foreach(@tuples);
77 $self->{tuples}->{$id} = \%tuples;
80 my %tup_copy = ();
81 %tup_copy = %{$self->{tuples}};
82 while(my ($id, $tup_hash) = each %{$self->{tuples}}){
83 print STDERR "#";
84 my $qscore = 0;
85 while(my ($q_id, $q_tup_hash) = each %tup_copy){
86 next if ($q_id eq $id);
87 $qscore += $self->_tuple_score($tup_hash, $q_tup_hash);
89 $scores{$id} = $qscore;
91 $self->{scores} = \%scores;
92 print STDERR "\n";
95 =head2 getConsensus()
97 Syn: Get the consensus annotation
98 Ret: An array of ($id, $annotation, $score)
99 Side: Runs $self->calculate() if no values exist in score hash
101 =cut
103 sub getConsensus {
104 my $self = shift;
105 $self->calculate() unless (values(%{$self->{scores}}));
106 my @ids = keys %{$self->{scores}};
107 my @sample = keys %{$self->{scores}};
108 my $s_id = pop @sample;
109 my $max = $self->{scores}->{$s_id};
110 my $max_id = $s_id;
111 while (my ($id, $score) = each %{$self->{scores}}){
112 if ($score > $max) {
113 $max_id = $id;
114 $max = $score;
117 return ($max_id, $self->{annotations}->{$max_id}, $max);
120 =head2 getScores()
122 Syn: Gets a hash of $id => $score
123 Side: Calculates if no values in score hash
125 =cut
127 sub getScores {
128 my $self = shift;
129 $self->calculate() unless (values(%{$self->{scores}}));
130 return $self->{scores};
133 sub getMaxTuple {
134 my $self = shift;
135 return $self->{max_tuple};
138 #Replaced levenshtein with my own algorithm, should be much faster
139 sub _get_tuples {
140 my $self = shift;
141 my $string = shift;
142 $string =~ s/\(.*?\)//g;
143 $string =~ s/\[.*?\]//g;
144 $string =~ s/;|,//g;
145 $string = lc($string); #Case-insensitive tuple-comparison
146 my @words = split /\s+/, $string;
148 my @tuples = ();
149 push(@tuples, @words);
151 my $t = 2;
152 while($t <= @words){
153 my $i = 0;
154 while($i < (@words - $t + 1)){
155 my @tuple = ();
156 for (my $k = $i; $k < $i + $t; $k++){
157 push(@tuple, $words[$k]);
159 push(@tuples, join " ", @tuple);
160 $i++;
162 $t++;
164 return @tuples;
167 sub _tuple_score {
168 my $self = shift;
169 my ($base_tup, $q_tup) = @_;
170 die "Arguments should be hash refs" unless (ref $base_tup && ref $q_tup);
171 my $score = 0;
172 my %base_tup = %$base_tup;
173 my %query_tup = %$q_tup;
174 while(my ($k, $v) = each %base_tup){
175 next unless $v;
176 my $num_spaces = 0;
177 $num_spaces++ while($k =~ / /g);
178 my $multiplier = ($num_spaces+1)**2;
179 if($query_tup{$k}){
180 $score += $multiplier;
181 if($multiplier > $self->{max_tuple_score}){
182 $self->{max_tuple} = $k;
186 return $score;
191 #Not using this anymore, way too slow:
192 sub _levenshtein {
193 my $self = shift;
194 my ($s1, $s2) = @_;
195 my $m = length $s1;
196 my $n = length $s2;
198 my %matrix = ();
199 my $matrix = \%matrix;
201 my ($i, $j);
202 for($i = 0; $i<=$m; $i++){
203 $matrix->{$i}->{0} = $i;
205 for($j = 0; $j<=$n; $j++){
206 $matrix->{0}->{$j} = $j;
209 for(my $i = 1; $i<=$m; $i++){
210 for(my $j = 1; $j<=$n; $j++){
211 my $cost;
212 if (lc(substr($s1, $i-1, 1)) eq lc(substr($s2, $j-1, 1))){
213 $cost = 0;
215 else {
216 $cost = 1;
218 $matrix->{$i}->{$j} = __minimum (
219 $matrix->{$i-1}->{$j} + 1,
220 $matrix->{$i}->{$j-1} + 1,
221 $matrix->{$i-1}->{$j-1} + $cost
225 return $matrix->{$m}->{$n};
228 sub __minimum {
229 my $min = shift;
230 foreach(@_){
231 $min = $_ if ($_ < $min);
233 return $min;