4 package CXGN
::Cluster
::Match
;
9 my $self = bless {}, $class;
10 $self->set_debug($debug);
11 $self->set_word_size(10);
12 $self->set_min_match_length(10);
16 =head2 accessors get_query, set_query
29 return $self->{query
};
34 $self->{query
} = shift;
35 $self->debug("query: $self->{query}");
37 =head2 accessors get_subject, set_subject
49 return $self->{subject
};
54 $self->{subject
} = shift;
55 # $self->debug("subject: $self->{subject}");
59 =head2 accessors get_debug, set_debug
71 return $self->{debug
};
76 $self->{debug
} = shift;
93 if ($self->get_debug()) {
94 print STDERR
"$message\n";
98 =head2 accessors get_word_size, set_word_size
110 return $self->{word_size
};
115 $self->{word_size
} = shift;
118 =head2 accessors get_min_match_length, set_min_match_length
121 Desc: the minimal length of a match in bp to be retained
122 shorter matches will be discarded. Note that this cannot
123 be smaller than the word size.
130 sub get_min_match_length
{
132 return $self->{min_match_length
};
135 sub set_min_match_length
{
137 $self->{min_match_length
} = shift;
141 =head2 match_sequences
152 sub match_sequences
{
155 $self->debug("hashing query...");
156 my $query_hashref = $self->hash_matches($self->get_query());
158 $self->debug("hashing subject...");
159 my $subject_hashref = $self->hash_matches($self->get_subject());
163 # sort by coordinates: first produce flat list and then sort
165 foreach my $word (keys(%$query_hashref)) {
166 foreach my $query_start (@
{$query_hashref->{$word}}) {
167 push @match_list, [ $query_start, $word ];
170 my @sorted_match_list = sort { $a->[0] <=> $b->[0]; } @match_list;
171 my $largest_query_match_end = 0;
172 foreach my $match (@sorted_match_list) {
173 my $query_start = $match->[0];
174 my $word = $match->[1];
175 $self->debug("extending word $word at pos $query_start...");
176 my $query_match_start = undef;
177 my $query_match_end = undef;
178 my $subject_match_start = undef;
179 my $subject_match_end = undef;
180 if ( ($query_start < $largest_query_match_end)) {
181 print STDERR
"Skipping $word at $query_start because it is in a previous match\n";
185 foreach my $subject_start (@
{$subject_hashref->{$word}}) {
187 my $five_prime_match = $self->five_prime_extend_match($query_start, $subject_start);
188 my $three_prime_match = $self->three_prime_extend_match($query_start, $subject_start);
189 my $match= $five_prime_match.$word.$three_prime_match;
190 $self->debug("Match: $five_prime_match \| $word \| $three_prime_match");
192 $query_match_start = $query_start - length($five_prime_match) +1;
193 $query_match_end = $query_start + $self->get_word_size() + length($three_prime_match) +1;
194 $subject_match_start = $subject_start - length($five_prime_match) +1;
195 $subject_match_end = $subject_start + $self->get_word_size() + length($three_prime_match) +1;
198 if ($query_match_end - $query_match_start +1 > $self->get_min_match_length()) {
200 push @matches, [ $match,
203 $subject_match_start,
208 $self->debug("Skipping match because length < ".$self->get_min_match_length());
211 if ($largest_query_match_end < $query_match_end) {
212 $largest_query_match_end = $query_match_end;
221 sub five_prime_extend_match
{
224 $self->debug("five_prime_extend_match");
225 my $query_start = shift;
226 my $subject_start = shift;
228 my @query = split //, $self->get_query();
229 my @subject = split //, $self->get_subject();
230 my $five_prime_match = "";
235 while ( ($query_start - $i >= 0) && ($subject_start - $i >= 0)) {
236 $self->debug("comparing query_pos ".($query_start-$i)." ".($query[$query_start-$i])." with subject pos ".($subject_start-$i)." ".($subject[$subject_start-$i]));
238 my $score = $self->match_score($query[$query_start - $i], $subject[$subject_start -$i]);
239 my $match_score += $score;
240 $self->debug("score: $score. total match_score: $match_score");
241 if ($match_score >= 0) { $five_prime_match = $query[$query_start-$i].$five_prime_match; }
246 return $five_prime_match;
249 sub three_prime_extend_match
{
251 $self->debug("three_prime_extend_match");
252 my $query_start = shift;
253 my $subject_start = shift;
255 my @query = split //, $self->get_query();
256 my @subject = split //, $self->get_subject();
258 my $three_prime_match = "";
260 while ( ($query_start + $self->get_word_size() +$i < length($self->get_query())) && ($subject_start + $self->get_word_size() + $i < length($self->get_subject())) ) {
262 $self->debug("comparing query_pos ".($query_start+$i+$self->get_word_size())." ".($query[$query_start+$i+$self->get_word_size()])." with subject pos ".($subject_start+$i+$self->get_word_size())." ".($subject[$subject_start+$i+$self->get_word_size()]));
264 my $score = $self->match_score($query[$query_start + $self->get_word_size() + $i], $subject[$subject_start + $self->get_word_size() + $i]);
266 $match_score += $score;
267 $self->debug("score: $score. total match_score: $match_score");
269 $three_prime_match = $three_prime_match . $query[$query_start + $self->get_word_size() + $i];
274 return $three_prime_match;
284 $self->debug("match score: $q vs $s");
285 if ($q eq $s) { return 2; }
293 $self->debug("hashing matches...");
295 foreach my $k (0..(length($seq)-$self->get_word_size())) {
296 my $word = substr($seq, $k, $self->get_word_size());
297 #$self->debug("word: $word, pos: $k");
298 push @
{$hash{$word}}, $k;