Merge pull request #42 from solgenomics/topic/duplicate_image_warning
[cxgn-corelibs.git] / lib / CXGN / Cluster / Match.pm
blob81d9237ee967d2aa9b62a9ce654a78fd78dafd74
2 use strict;
4 package CXGN::Cluster::Match;
6 sub new {
7 my $class = shift;
8 my $debug = shift;
9 my $self = bless {}, $class;
10 $self->set_debug($debug);
11 $self->set_word_size(10);
12 $self->set_min_match_length(10);
13 return $self;
16 =head2 accessors get_query, set_query
18 Usage:
19 Desc:
20 Property
21 Side Effects:
22 Example:
24 =cut
26 sub get_query {
27 my $self = shift;
29 return $self->{query};
32 sub set_query {
33 my $self = shift;
34 $self->{query} = shift;
35 $self->debug("query: $self->{query}");
37 =head2 accessors get_subject, set_subject
39 Usage:
40 Desc:
41 Property
42 Side Effects:
43 Example:
45 =cut
47 sub get_subject {
48 my $self = shift;
49 return $self->{subject};
52 sub set_subject {
53 my $self = shift;
54 $self->{subject} = shift;
55 # $self->debug("subject: $self->{subject}");
59 =head2 accessors get_debug, set_debug
61 Usage:
62 Desc:
63 Property
64 Side Effects:
65 Example:
67 =cut
69 sub get_debug {
70 my $self = shift;
71 return $self->{debug};
74 sub set_debug {
75 my $self = shift;
76 $self->{debug} = shift;
79 =head2 debug
81 Usage:
82 Desc:
83 Ret:
84 Args:
85 Side Effects:
86 Example:
88 =cut
90 sub debug {
91 my $self = shift;
92 my $message = shift;
93 if ($self->get_debug()) {
94 print STDERR "$message\n";
98 =head2 accessors get_word_size, set_word_size
100 Usage:
101 Desc:
102 Property
103 Side Effects:
104 Example:
106 =cut
108 sub get_word_size {
109 my $self = shift;
110 return $self->{word_size};
113 sub set_word_size {
114 my $self = shift;
115 $self->{word_size} = shift;
118 =head2 accessors get_min_match_length, set_min_match_length
120 Usage:
121 Desc: the minimal length of a match in bp to be retained
122 shorter matches will be discarded. Note that this cannot
123 be smaller than the word size.
124 Property
125 Side Effects:
126 Example:
128 =cut
130 sub get_min_match_length {
131 my $self = shift;
132 return $self->{min_match_length};
135 sub set_min_match_length {
136 my $self = shift;
137 $self->{min_match_length} = shift;
141 =head2 match_sequences
143 Usage:
144 Desc:
145 Ret:
146 Args:
147 Side Effects:
148 Example:
150 =cut
152 sub match_sequences {
153 my $self = shift;
155 $self->debug("hashing query...");
156 my $query_hashref = $self->hash_matches($self->get_query());
158 $self->debug("hashing subject...");
159 my $subject_hashref = $self->hash_matches($self->get_subject());
161 my @matches = ();
163 # sort by coordinates: first produce flat list and then sort
164 my @match_list = ();
165 foreach my $word (keys(%$query_hashref)) {
166 foreach my $query_start (@{$query_hashref->{$word}}) {
167 push @match_list, [ $query_start, $word ];
170 my @sorted_match_list = sort { $a->[0] <=> $b->[0]; } @match_list;
171 my $largest_query_match_end = 0;
172 foreach my $match (@sorted_match_list) {
173 my $query_start = $match->[0];
174 my $word = $match->[1];
175 $self->debug("extending word $word at pos $query_start...");
176 my $query_match_start = undef;
177 my $query_match_end = undef;
178 my $subject_match_start = undef;
179 my $subject_match_end = undef;
180 if ( ($query_start < $largest_query_match_end)) {
181 print STDERR "Skipping $word at $query_start because it is in a previous match\n";
182 next;
184 else {
185 foreach my $subject_start (@{$subject_hashref->{$word}}) {
187 my $five_prime_match = $self->five_prime_extend_match($query_start, $subject_start);
188 my $three_prime_match = $self->three_prime_extend_match($query_start, $subject_start);
189 my $match= $five_prime_match.$word.$three_prime_match;
190 $self->debug("Match: $five_prime_match \| $word \| $three_prime_match");
192 $query_match_start = $query_start - length($five_prime_match) +1;
193 $query_match_end = $query_start + $self->get_word_size() + length($three_prime_match) +1;
194 $subject_match_start = $subject_start - length($five_prime_match) +1;
195 $subject_match_end = $subject_start + $self->get_word_size() + length($three_prime_match) +1;
198 if ($query_match_end - $query_match_start +1 > $self->get_min_match_length()) {
200 push @matches, [ $match,
201 $query_match_start,
202 $query_match_end,
203 $subject_match_start,
204 $subject_match_end
207 else {
208 $self->debug("Skipping match because length < ".$self->get_min_match_length());
211 if ($largest_query_match_end < $query_match_end) {
212 $largest_query_match_end = $query_match_end;
218 return @matches;
221 sub five_prime_extend_match {
222 my $self = shift;
224 $self->debug("five_prime_extend_match");
225 my $query_start = shift;
226 my $subject_start = shift;
228 my @query = split //, $self->get_query();
229 my @subject = split //, $self->get_subject();
230 my $five_prime_match = "";
231 my $match_score = 1;
232 my $i = 1;
235 while ( ($query_start - $i >= 0) && ($subject_start - $i >= 0)) {
236 $self->debug("comparing query_pos ".($query_start-$i)." ".($query[$query_start-$i])." with subject pos ".($subject_start-$i)." ".($subject[$subject_start-$i]));
238 my $score = $self->match_score($query[$query_start - $i], $subject[$subject_start -$i]);
239 my $match_score += $score;
240 $self->debug("score: $score. total match_score: $match_score");
241 if ($match_score >= 0) { $five_prime_match = $query[$query_start-$i].$five_prime_match; }
242 else { last; }
243 $i++;
246 return $five_prime_match;
249 sub three_prime_extend_match {
250 my $self = shift;
251 $self->debug("three_prime_extend_match");
252 my $query_start = shift;
253 my $subject_start = shift;
255 my @query = split //, $self->get_query();
256 my @subject = split //, $self->get_subject();
257 my $match_score = 1;
258 my $three_prime_match = "";
259 my $i = 1;
260 while ( ($query_start + $self->get_word_size() +$i < length($self->get_query())) && ($subject_start + $self->get_word_size() + $i < length($self->get_subject())) ) {
262 $self->debug("comparing query_pos ".($query_start+$i+$self->get_word_size())." ".($query[$query_start+$i+$self->get_word_size()])." with subject pos ".($subject_start+$i+$self->get_word_size())." ".($subject[$subject_start+$i+$self->get_word_size()]));
264 my $score = $self->match_score($query[$query_start + $self->get_word_size() + $i], $subject[$subject_start + $self->get_word_size() + $i]);
266 $match_score += $score;
267 $self->debug("score: $score. total match_score: $match_score");
268 if ($score > 0) {
269 $three_prime_match = $three_prime_match . $query[$query_start + $self->get_word_size() + $i];
271 else { last; }
272 $i++;
274 return $three_prime_match;
278 sub match_score {
279 my $self = shift;
281 my $q = shift;
282 my $s = shift;
284 $self->debug("match score: $q vs $s");
285 if ($q eq $s) { return 2; }
286 else { return -1; }
289 sub hash_matches {
290 my $self = shift;
292 my $seq = shift;
293 $self->debug("hashing matches...");
294 my %hash= ();
295 foreach my $k (0..(length($seq)-$self->get_word_size())) {
296 my $word = substr($seq, $k, $self->get_word_size());
297 #$self->debug("word: $word, pos: $k");
298 push @{$hash{$word}}, $k;
300 return \%hash;
307 return 1;