1 package CXGN
::BreedersToolbox
::StocksFuzzySearch
;
5 CXGN::BreedersToolbox::StocksFuzzySearch - an object that finds approximate matches in the database for a query list of stock names.
9 my $fuzzy_stock_search = CXGN::BreedersToolbox::StocksFuzzySearch->new({schema => $schema});
10 my $fuzzy_search_result = $fuzzy_stock_search->get_matches(\@stock_list, $max_distance, $stock_type);
17 Jeremy D. Edwards (jde22@cornell.edu)
24 use MooseX
::FollowPBP
;
25 use Moose
::Util
::TypeConstraints
;
26 use CXGN
::String
::FuzzyMatch
;
27 use SGN
::Model
::Cvterm
;
32 isa
=> 'DBIx::Class::Schema',
# Arguments, in call order: a reference to the list of query stock names,
# the maximum distance handed to the fuzzy matcher, and the name of the
# stock type (resolved in the 'stock_type' cv) that limits the search.
39 my $stock_list_ref = shift;
40 my $max_distance = shift;
41 my $stock_type = shift;
42 my $schema = $self->get_schema();
# Work on a copy of the caller's list.
43 my @stock_list = @
{$stock_list_ref};
# Maps a synonym string to an array ref of the uniquenames it was seen with.
44 my %synonym_uniquename_lookup;
# The matcher is built with case_insensitive => 0; case folding is done
# below by lowercasing both the query and the candidate names.
45 my $fuzzy_string_search = CXGN
::String
::FuzzyMatch
->new( { case_insensitive
=> 0} );
# Timing trace written to STDERR.
51 print STDERR
"FuzzySearch 1".localtime()."\n";
# Resolve the cvterm ids for the synonym property and the requested stock type.
53 my $synonym_type_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, 'stock_synonym', 'stock_property')->cvterm_id();
54 my $stock_type_id = SGN
::Model
::Cvterm
->get_cvterm_row($schema, $stock_type, 'stock_type')->cvterm_id();
# One row per (stock, stockprop) pair for stocks of the requested type.
# Because of the LEFT JOIN, a stock without any stockprop row still yields
# one row, with value and type_id NULL.
# NOTE(review): $stock_type_id is interpolated directly into the SQL text;
# a bind placeholder would be the usual DBI form - confirm before changing.
55 my $q = "SELECT stock.uniquename, stockprop.value, stockprop.type_id FROM stock LEFT JOIN stockprop USING(stock_id) WHERE stock.type_id=$stock_type_id";
56 my $h = $schema->storage->dbh()->prepare($q);
# Record every uniquename; rows whose property type is the synonym type
# also feed the synonym -> uniquenames lookup.
59 while (my ($uniquename, $synonym, $type_id) = $h->fetchrow_array()) {
60 $uniquename_hash{$uniquename}++;
# NOTE(review): $type_id is undef for the NULL rows produced by the LEFT
# JOIN, so this numeric comparison emits an "uninitialized value" warning
# when warnings are enabled - consider a defined() guard.
62 if ($type_id == $synonym_type_id){
63 push @
{$synonym_uniquename_lookup{$synonym}}, $uniquename;
# Candidate pool for fuzzy matching: all uniquenames plus all synonym strings.
68 my @stock_names = keys %uniquename_hash;
69 my @synonym_names = keys %synonym_uniquename_lookup;
70 push (@stock_names, @synonym_names);
# Lowercase every candidate and remember how to map it back to its original
# spelling. If two candidates differ only by case, the later one overwrites
# the earlier one in this lookup.
73 my %lowercase_name_lookup;
74 foreach (@stock_names){
75 push @lowercased_names, lc($_);
76 $lowercase_name_lookup{lc($_)} = $_;
# Timing trace written to STDERR.
79 print STDERR
"FuzzySearch 2".localtime()."\n";
# Classify each query name.
81 foreach my $stock_name (@stock_list) {
# Exact (case-sensitive) match on a uniquename.
83 if (exists($uniquename_hash{$stock_name})){
84 push @found_stocks, {matched_string
=> $stock_name, unique_name
=> $stock_name};
# Exact (case-sensitive) match on a synonym string.
88 if (exists($synonym_uniquename_lookup{$stock_name})){
# A synonym attached to more than one uniquename is reported in $error.
90 if (scalar(@
{$synonym_uniquename_lookup{$stock_name}}) > 1){
91 my $synonym_lookup_uniquename = join ',', @
{$synonym_uniquename_lookup{$stock_name}};
92 $error .= "This synonym $stock_name has more than one uniquename $synonym_lookup_uniquename. This should not happen!";
# Exactly one owner: label the match with the uniquename it is a synonym of.
94 } elsif (scalar(@
{$synonym_uniquename_lookup{$stock_name}}) == 1){
95 $match_info{matched_string
} = $stock_name." (SYNONYM OF ".$synonym_uniquename_lookup{$stock_name}->[0].")";
96 $match_info{is_synonym
} = 1;
97 $match_info{unique_name
} = $synonym_uniquename_lookup{$stock_name}->[0];
99 push @found_stocks, \
%match_info;
# Pre-filter the lowercased candidates by length before fuzzy matching.
103 my @search_stock_names;
104 foreach (@lowercased_names){
105 #if there is a difference in length greater than 10, it will not fuzzy search over that name
106 if (abs(length($_) - length($stock_name)) <= 10){
107 push @search_stock_names, $_;
# Fuzzy-match the lowercased query against the filtered, lowercased candidates.
111 my @stock_matches = @
{$fuzzy_string_search->get_matches(lc($stock_name), \
@search_stock_names, $max_distance)};
# No fuzzy hits: the name is reported as absent.
113 if (scalar(@stock_matches) == 0) {
114 push (@absent_stocks, $stock_name);
# Build one record per fuzzy hit, mapping the lowercased hit back to its
# original spelling.
117 foreach (@stock_matches){
119 my $matched_name = $_->{string
};
120 my $original_matched_name = $lowercase_name_lookup{$matched_name};
121 $match_info{'name'} = $original_matched_name;
122 $match_info{'distance'} = $_->{distance
};
# If the matched name is itself a synonym string, this is the list of
# uniquenames it belongs to; otherwise an empty list.
123 my $synonym_lookup_of_matched_string = $synonym_uniquename_lookup{$original_matched_name} || [];
124 if (scalar(@
$synonym_lookup_of_matched_string) > 1){
125 my $synonym_lookup_uniquename = join ',', @
$synonym_lookup_of_matched_string;
126 $error .= "This synonym $matched_name has more than one uniquename $synonym_lookup_uniquename. This should not happen!";
128 } elsif (scalar(@
$synonym_lookup_of_matched_string) == 1){
# 'unique_names' holds the matched name itself both here and in the later
# assignment; for a synonym, the owning uniquename goes under 'synonym_of'.
129 $match_info{'unique_names'} = [$original_matched_name];
130 $match_info{'is_synonym'} = 1;
131 $match_info{'synonym_of'} = $synonym_lookup_of_matched_string->[0];
133 $match_info{'unique_names'} = [$original_matched_name];
135 push @matches, \
%match_info;
137 push @fuzzy_stocks, {
# Surface accumulated synonym-consistency errors on STDERR and in the result.
145 print STDERR
"FUZZY ERRORS: $error\n";
146 $results{'error'} = $error;
# Result buckets: exact matches, fuzzy candidates, and names with no match.
149 $results{'found'} = \
@found_stocks;
150 $results{'fuzzy'} = \
@fuzzy_stocks;
151 $results{'absent'} = \
@absent_stocks;