minor fixes
[sgn.git] / lib / CXGN / String / FuzzyMatch.pm
blob0c83cc0b93829ff2d82793a2572502cec758da91
1 package CXGN::String::FuzzyMatch;
3 =head1 NAME
5 CXGN::String::FuzzyMatch - an object to find approximate matches to a query string in an array of strings.
7 =head1 USAGE
9 my $fuzzy_string_search = CXGN::String::FuzzyMatch->new();
10 my @string_matches = @{$fuzzy_string_search->get_matches($query_string, \@string_array, $max_distance)};
12 =head1 DESCRIPTION
15 =head1 AUTHORS
17 Jeremy D. Edwards (jde22@cornell.edu)
19 =cut
21 use strict;
22 use warnings;
23 use String::Approx 'adistr';
24 use Moose;
25 use Data::Dumper;
# Read/write boolean attribute. When it is true, get_matches upper-cases the
# query string and every candidate string before comparing them.
27 has 'case_insensitive' => ( isa => 'Bool',
28 is => 'rw',
# get_matches($query_string, \@string_array, $max_distance)
#
# Finds the entries of an array of strings that match a query string exactly
# or approximately.
#
# Parameters:
#   $query_string     - the string to look for.
#   $string_array_ref - reference to an array of candidate strings.
#   $max_distance     - largest distance, compared by absolute value against
#                       the value returned by String::Approx::adistr (a
#                       relative distance according to the String::Approx
#                       documentation), that still counts as a match.
#                       0 (or undef) disables fuzzy matching: only candidates
#                       equal to the query are returned.
#
# Returns a reference to an array of hash references, each with the keys
# 'string' and 'distance'.  Candidates with distance 0 come first, ordered by
# how much their length differs from the query; the remaining matches follow,
# ordered by increasing absolute distance.  Ties are broken alphabetically so
# the result order is reproducible between runs.  Duplicate candidates are
# reported once.  When the case_insensitive attribute is true, the returned
# strings are the upper-cased forms of the candidates.
sub get_matches {
    my ($self, $query_string, $string_array_ref, $max_distance) = @_;

    # An undefined limit behaves like 0 (exact matching only); normalising it
    # here avoids an "uninitialized value" warning in the comparison below.
    $max_distance = 0 unless defined $max_distance;

    my @candidates = @{$string_array_ref};

    if ($self->case_insensitive()) {
        $query_string = uc $query_string;
        @candidates   = map { uc } @candidates;
    }

    # Measured after case folding so that it is comparable with the lengths
    # of the (possibly folded) candidates.
    my $query_length = length $query_string;

    my %length_difference_of;    # distance-0 matches => length difference
    my %distance_of;             # other matches      => adistr distance

    #no fuzzy search if max distance is 0
    if ($max_distance == 0) {
        for my $candidate (@candidates) {
            $length_difference_of{$candidate} = 0
                if $candidate eq $query_string;
        }
    } elsif (@candidates) {
        # adistr returns one distance per candidate, in the same order.
        my @distances = adistr($query_string, @candidates);

        for my $i (0 .. $#candidates) {
            my $distance  = $distances[$i];
            my $candidate = $candidates[$i];
            if ($distance == 0) {
                $length_difference_of{$candidate}
                    = length($candidate) - $query_length;
            } elsif (abs($distance) <= $max_distance) {
                $distance_of{$candidate} = $distance;
            }
        }
    }

    #get a list of strings sorted by their difference in length from the query
    my @sorted_by_length = sort {
        abs($length_difference_of{$a}) <=> abs($length_difference_of{$b})
            || $a cmp $b
    } keys %length_difference_of;

    #get a list of strings sorted by their distance from the query
    my @sorted_by_distance = sort {
        abs($distance_of{$a}) <=> abs($distance_of{$b})
            || $a cmp $b
    } keys %distance_of;

    my @matches = (
        (map { +{ string => $_, distance => 0 } } @sorted_by_length),
        (map { +{ string => $_, distance => $distance_of{$_} } } @sorted_by_distance),
    );

    return \@matches;
}