1 package CXGN
::BreedersToolbox
::StocksFuzzySearch
;
5 CXGN::BreedersToolbox::StocksFuzzySearch - an object to find approximate matches in the database to a query list of stock names.
9 my $fuzzy_stock_search = CXGN::BreedersToolbox::StocksFuzzySearch->new({schema => $schema});
10 my $fuzzy_search_result = $fuzzy_stock_search->get_matches(\@stock_list, $max_distance, $stock_type);
17 Jeremy D. Edwards (jde22@cornell.edu)
24 use MooseX
::FollowPBP
;
25 use Moose
::Util
::TypeConstraints
;
26 use CXGN
::String
::FuzzyMatch
;
27 use SGN
::Model
::Cvterm
;
32 isa
=> 'DBIx::Class::Schema',
39 my $stock_list_ref = shift;
40 my $max_distance = shift;
41 my $stock_type = shift;
42 my $schema = $self->get_schema();
43 my @stock_list = @
{$stock_list_ref};
44 my %synonym_uniquename_lookup;
45 my $fuzzy_string_search = CXGN
::String
::FuzzyMatch
->new( { case_insensitive
=> 0} );
51 print STDERR
"FuzzySearch 1".localtime()."\n";
53 my $synonym_type_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'stock_synonym', 'stock_property')->cvterm_id();
54 my $stock_type_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, $stock_type, 'stock_type')->cvterm_id();
55 my $q = "SELECT stock.uniquename, stockprop.value, stockprop.type_id FROM stock LEFT JOIN stockprop USING(stock_id) WHERE stock.type_id=$stock_type_id";
56 my $h = $schema->storage->dbh()->prepare($q);
59 while (my ($uniquename, $synonym, $type_id) = $h->fetchrow_array()) {
60 $uniquename_hash{$uniquename}++;
62 if ($type_id == $synonym_type_id){
63 push @
{$synonym_uniquename_lookup{$synonym}}, $uniquename;
68 my @stock_names = keys %uniquename_hash;
69 my @synonym_names = keys %synonym_uniquename_lookup;
70 push (@stock_names, @synonym_names);
73 my %lowercase_name_lookup;
74 foreach (@stock_names){
75 push @lowercased_names, lc($_);
76 $lowercase_name_lookup{lc($_)} = $_;
79 my @lowercased_synonyms;
80 my %lowercase_synonym_lookup;
81 foreach (@synonym_names){
82 push @lowercased_synonyms, lc($_);
83 $lowercase_synonym_lookup{lc($_)} = $_;
85 print STDERR
"FuzzySearch 2".localtime()."\n";
87 foreach my $stock_name (@stock_list) {
88 #lookup case insensitive stock names#
89 my $lc_name = lc($stock_name);
90 if (exists($lowercase_name_lookup{$lc_name})){
91 my $uniquename = $lowercase_name_lookup{$lc_name};
92 push @found_stocks, {matched_string
=> $stock_name, unique_name
=> $uniquename};
95 #lookup case insensitive stock synonyms#
96 if (exists($lowercase_synonym_lookup{$lc_name})){
98 if (scalar(@
{$lowercase_synonym_lookup{$lc_name}}) > 1){
99 my $synonym_lookup_uniquename = join ',', @
{$lowercase_synonym_lookup{$lc_name}};
100 $error .= "This synonym $stock_name has more than one uniquename $synonym_lookup_uniquename. This should not happen!";
102 } elsif (scalar(@
{$lowercase_synonym_lookup{$lc_name}}) == 1){
103 $match_info{matched_string
} = $stock_name." (SYNONYM OF ".$lowercase_synonym_lookup{$lc_name}->[0].")";
104 $match_info{is_synonym
} = 1;
105 $match_info{unique_name
} = $lowercase_synonym_lookup{$lc_name}->[0];
107 push @found_stocks, \
%match_info;
111 my @search_stock_names;
112 foreach (@lowercased_names){
113 #only fuzzy search over names whose length differs from the query name by 10 characters or fewer
114 if (abs(length($_) - length($stock_name)) <= 10){
115 push @search_stock_names, $_;
119 #####case-sensitive matches are exact_match
120 my @stock_matches = @
{$fuzzy_string_search->get_matches(lc($stock_name), \
@search_stock_names, $max_distance)};
122 if (scalar(@stock_matches) == 0) {
123 push (@absent_stocks, $stock_name);
126 foreach (@stock_matches){
128 my $matched_name = $_->{string
};
129 my $original_matched_name = $lowercase_name_lookup{$matched_name};
130 $match_info{'name'} = $original_matched_name;
131 $match_info{'distance'} = $_->{distance
};
132 my $synonym_lookup_of_matched_string = $synonym_uniquename_lookup{$original_matched_name} || [];
133 if (scalar(@
$synonym_lookup_of_matched_string) > 1){
134 my $synonym_lookup_uniquename = join ',', @
$synonym_lookup_of_matched_string;
135 $error .= "This synonym $matched_name has more than one uniquename $synonym_lookup_uniquename. This should not happen!";
137 } elsif (scalar(@
$synonym_lookup_of_matched_string) == 1){
138 $match_info{'unique_names'} = [$original_matched_name];
139 $match_info{'is_synonym'} = 1;
140 $match_info{'synonym_of'} = $synonym_lookup_of_matched_string->[0];
142 $match_info{'unique_names'} = [$original_matched_name];
144 push @matches, \
%match_info;
146 push @fuzzy_stocks, {
154 print STDERR
"FUZZY ERRORS: $error\n";
155 $results{'error'} = $error;
158 $results{'found'} = \
@found_stocks;
159 $results{'fuzzy'} = \
@fuzzy_stocks;
160 $results{'absent'} = \
@absent_stocks;